1. Install LDna and load the libraries

# Load devtools
library(devtools)

# Install LDna from GitHub
devtools::install_github("petrikemppainen/LDna", ref = 'v2.0')

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  544650 29.1    1221510 65.3         NA   669265 35.8
## Vcells 1021806  7.8    8388608 64.0      32768  1840446 14.1

Restart the R session if needed

.rs.restartR()

Load the libraries

library(tidyverse)
library(here)
library(colorout)
library(dplyr)
library(flextable)
library(ggplot2)
library(scales)
library(reticulate)
library(extrafont)
library(stringr)
library(flextable)
library(officer)
library(ggrepel)
library(Cairo)
library(LDna)
library(reshape2)
library(viridis)
library(igraph)
library(writexl)

2. Data tyding

We will have to estimate LD with Plink and prepare the data to use it with LDna. The first step is to decided with SNP set we will use. For example, we did LD pruning for some analysis, so we should not use those files to start the linkage network analysis. We also have to decided about the minor allele frequency threshold we will use.

We can remove the individuals that failed some of our tests. One mosquito, AUT 399, failed the heterozygosity test. None failed the relatedness test, and 867 SNPs failed the HWE test. We can start with file4 from the quality control. We can remove the mosquito and the SNPs that did not pass the HWE test.

Create file to remove mosquito

echo "AUT 399" > output/quality_control/remove_aut_399.txt

Remove mosquito and SNPs

plink2 \
--allow-extra-chr \
--bfile output/quality_control/file4 \
--remove output/quality_control/remove_aut_399.txt \
--extract output/quality_control/passed_hwe.txt \
--make-bed \
--out output/ldna/files/file1 \
--silent;

grep "samples\|variants" output/ldna/files/file1.log
## 61 samples (25 females, 26 males, 10 ambiguous; 61 founders) loaded from
## 111220 variants loaded from output/quality_control/file4.bim.
## --extract: 110353 variants remaining.
## --remove: 60 samples remaining.
## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) remaining after
## 110353 variants remaining after main filters.

The next step is to create the chromosomal scale for this data set. Check the bim file now to see that it is using the scaffold scale

head output/ldna/files/file1.bim
## 1.1  AX-581444870    0   97856   C   T
## 1.1  AX-583035083    0   305518  A   G
## 1.1  AX-583035102    0   308124  A   G
## 1.1  AX-583033342    0   315059  C   G
## 1.1  AX-583035163    0   315386  A   G
## 1.1  AX-583033356    0   315674  C   T
## 1.1  AX-583033370    0   330057  G   A
## 1.1  AX-583035194    0   330265  A   G
## 1.1  AX-583035198    0   330908  G   T
## 1.1  AX-583033387    0   331288  C   T

2.1 Create chromosomal scale

Import the .bim file with the SNPs to create a new chromosomal scale.

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
snps <- import_bim(here("output", "ldna", "files", "file1.bim"))

# Check it
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-581444870     0    97856 C       T      
## 2 1.1      AX-583035083     0   305518 A       G      
## 3 1.1      AX-583035102     0   308124 A       G      
## 4 1.1      AX-583033342     0   315059 C       G      
## 5 1.1      AX-583035163     0   315386 A       G      
## 6 1.1      AX-583033356     0   315674 C       T

Separate the tibbles into each chromosome.

#   ____________________________________________________________________________
#   separate the SNP data per chromosome                                    ####
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()

Import the file with sizes of each scaffold.

#   ____________________________________________________________________________
#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "data", "genome", "scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   ____________________________________________________________________________
#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   ____________________________________________________________________________
#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}

Merge the data frames scaffolds and SNPs.

#   ____________________________________________________________________________
#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use lef_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )

Merge all chromosome scales.

#   ____________________________________________________________________________
#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )
# check it
head(chroms)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm midPos_fullseq Allele1 Allele2
##   <chr>      <chr>        <int>          <dbl> <chr>   <chr>  
## 1 1          AX-581444870     0          97856 C       T      
## 2 1          AX-583035083     0         305518 A       G      
## 3 1          AX-583035102     0         308124 A       G      
## 4 1          AX-583033342     0         315059 C       G      
## 5 1          AX-583035163     0         315386 A       G      
## 6 1          AX-583033356     0         315674 C       T

Save the new .bim file

#   ____________________________________________________________________________
#   save the new bim file with a new name, I added "B"                      ####
write.table(
  chroms,
  file      = here(
    "output", "ldna", "files", "file1B.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Rename the .bim files

# change the name of the first .bim file, for example, append _backup.bim, and then replace the original file
mv output/ldna/files/file1.bim output/ldna/files/file1_backup.bim;
# than change the new bim we create to the original name (do it only once, otherwise it will mess up)
mv output/ldna/files/file1B.bim output/ldna/files/file1.bim

Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.

plink2 \
--bfile output/ldna/files/file1 \
--make-bed \
--out output/ldna/test01;
# then we remove the files 
rm output/ldna/test01.*
## PLINK v2.00a3.3 64-bit (3 Jun 2022)            www.cog-genomics.org/plink/2.0/
## (C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
## Logging to output/ldna/test01.log.
## Options in effect:
##   --bfile output/ldna/files/file1
##   --make-bed
##   --out output/ldna/test01
## 
## Start time: Thu Aug 15 15:02:36 2024
## 32768 MiB RAM detected; reserving 16384 MiB for main workspace.
## Using up to 12 threads (change this with --threads).
## 60 samples (24 females, 26 males, 10 ambiguous; 60 founders) loaded from
## output/ldna/files/file1.fam.
## 110353 variants loaded from output/ldna/files/file1.bim.
## 1 binary phenotype loaded (28 cases, 22 controls).
## Writing output/ldna/test01.fam ... done.
## Writing output/ldna/test01.bim ... done.
## Writing output/ldna/test01.bed ... 0%59%done.
## End time: Thu Aug 15 15:02:36 2024

No warnings from Plink2. Now, we can go ahead with our analysis.

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2517714 134.5    4907427 262.1         NA  3822528 204.2
## Vcells 4421247  33.8   11142633  85.1      32768 11142592  85.1

2.2 Subset by family

Estimate frequency

plink \
--keep-allele-order \
--bfile output/ldna/files/file1 \
--make-bed \
--freqx \
--out output/ldna/files/frq \
--silent;

grep "people\|variants" output/ldna/files/frq.log
## 110353 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## 110353 variants and 60 people pass filters and QC.

Now we can use bash to check if there SNPs with low heterozygosity

# count SNPs with het < 0.5 (we have 60 mosquitoes)
cat output/ldna/files/frq.frqx | awk '{ if ($6 <= 30) print }' | wc -l
##   104625

How many SNPs

110353 - 104625
## [1] 5728

We can remove this SNPs since a limiting factor for the linkage network analysis is memory

Create a file with the SNPs

# get list of snps
cat output/ldna/files/frq.frqx | awk '{ if ($6 <= 30) print }' | awk '{print $2}' > output/ldna/files/snps_het.txt

We can start our analysis with MAF of 5%. If we have memory issues we can use 10% next. We will also set the genotyping missingness to zero.

# Filter MAF 5% and do not allow missing genotypes
plink \
--keep-allele-order \
--bfile output/ldna/files/file1 \
--out output/ldna/files/file2 \
--maf 0.05 \
--geno 0 \
--make-bed \
--extract output/ldna/files/snps_het.txt \
--silent;

grep "people\|variants" output/ldna/files/file2.log
## 110353 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --extract: 104625 variants remaining.
## 49360 variants removed due to missing genotype data (--geno).
## 5502 variants removed due to minor allele threshold(s)
## 49763 variants and 60 people pass filters and QC.

Check the fam file families

# get the list of families
cat output/ldna/files/file2.fam | awk '!seen[$1]++' | awk '{print $1}'
## MAN
## AUT
## NEW

We can check how many mosquitoes per population as well

awk '{print $1}' output/ldna/files/file2.fam | sort | uniq -c | awk '{print $2, $1}' 
## AUT 28
## MAN 10
## NEW 22

I did the analysis using the 3 populations, however the results of MAN were odd because we have only 10 mosquitoes. Therefore, I decided not to do linkage network analysis with MAN. I finished the analysis to see the odd pattern, but I came back and changed the code from this part foward to include only AUT and NEW for the linkage network analysis. I will still create files for each family, however the I will use only the shared SNPs between NEW and AUT. We will not consider MAN. I left it in the code, but we will not use it for comparisons.

Create a file for each of them

# make a text file with the name of each population
for pop in $(cat output/ldna/files/file2.fam | awk '!seen[$1]++' | awk '{print $1}');
do
  echo $pop > output/ldna/files/$pop\.txt
done

Now use Plink to create a file for each family. Now we set a MAF threshold of 5% within each family

# use plink to create a plink file for each population
for pop in $(cat output/ldna/files/file2.fam | awk '!seen[$1]++' | awk '{print $1}');
do
  plink --keep-allele-order --allow-no-sex --bfile output/ldna/files/file2 --make-bed --keep-fam output/ldna/files/$pop\.txt --out output/ldna/files/$pop --geno 0 --maf 0.05 --silent
done

Now we can check the number of SNPs and samples in each file

AUT

grep "people\|variants" output/ldna/files/AUT.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --keep-fam: 28 people remaining.
## 0 variants removed due to missing genotype data (--geno).
## 12495 variants removed due to minor allele threshold(s)
## 37268 variants and 28 people pass filters and QC.

MAN

grep "people\|variants" output/ldna/files/MAN.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --keep-fam: 10 people remaining.
## 0 variants removed due to missing genotype data (--geno).
## 2505 variants removed due to minor allele threshold(s)
## 47258 variants and 10 people pass filters and QC.

NEW

grep "people\|variants" output/ldna/files/NEW.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --keep-fam: 22 people remaining.
## 0 variants removed due to missing genotype data (--geno).
## 4208 variants removed due to minor allele threshold(s)
## 45555 variants and 22 people pass filters and QC.

We can see that the number of SNPs kept is not the same. We need to get the shared SNPs. We can import the bim files and use R to get the intersect

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
AUT <- import_bim(here("output", "ldna", "files", "AUT.bim"))
MAN <- import_bim(here("output", "ldna", "files", "MAN.bim")) # not using MAN because it has only 10 mosquitoes
NEW <- import_bim(here("output", "ldna", "files", "NEW.bim"))

Now get the shared SNPs

# Identify common SNPs
# common_snps <- Reduce(intersect, list(AUT$SNP, MAN$SNP, NEW$SNP))
common_snps <- Reduce(intersect, list(AUT$SNP, NEW$SNP))

# Create a data frame with the common SNPs
common_snps_df <- data.frame(SNP = common_snps)

# Count them
length(common_snps)
## [1] 33836
# Write the data frame to a file
write.table(common_snps_df, here("output", "ldna", "files","common_snps.txt"), quote = FALSE, row.names = FALSE, col.names = FALSE)

We have 32,732 SNPs. It seems we remove a lot but still too much for LDna in a laptop. We can use the cluster but first I will try using a laptop.

Now we have to repeat the previous step when we create a file for each family but now using only the SNPs that have 5% MAF in each family

# use plink to create a plink file for each population
for pop in $(cat output/ldna/files/file2.fam | awk '!seen[$1]++' | awk '{print $1}');
do
  plink --keep-allele-order --allow-no-sex --bfile output/ldna/files/file2 --make-bed --keep-fam output/ldna/files/$pop\.txt --out output/ldna/files/$pop --extract output/ldna/files/common_snps.txt --silent
done

Now we can check the number of SNPs and they all should be the same

AUT

grep "people\|variants" output/ldna/files/AUT.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --extract: 33836 variants remaining.
## --keep-fam: 28 people remaining.
## 33836 variants and 28 people pass filters and QC.

MAN

grep "people\|variants" output/ldna/files/MAN.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --extract: 33836 variants remaining.
## --keep-fam: 10 people remaining.
## 33836 variants and 10 people pass filters and QC.

NEW

grep "people\|variants" output/ldna/files/NEW.log
## 49763 variants loaded from .bim file.
## 60 people (26 males, 24 females, 10 ambiguous) loaded from .fam.
## --extract: 33836 variants remaining.
## --keep-fam: 22 people remaining.
## 33836 variants and 22 people pass filters and QC.

Perfect. Now we can split the data by chromosome for each family.

2.3 Subset by chromosome within each family

We can create a new directory

# Create a directory
mkdir -p output/ldna/pop;

# We can copy the family files there
cp output/ldna/files/AUT* output/ldna/pop;
cp output/ldna/files/MAN* output/ldna/pop;
cp output/ldna/files/NEW* output/ldna/pop;

# remove the files we dont need
rm output/ldna/pop/*.txt;
rm output/ldna/pop/*.nosex

We can now get a list of SNPs for each chromosome. All the populations have the same SNPs, so we can use any of the files.

cat output/ldna/pop/AUT.bim | awk '$1 == 1' | awk '{print $2}' > output/ldna/pop/chr1_snps.txt;
cat output/ldna/pop/AUT.bim | awk '$1 == 2' | awk '{print $2}' > output/ldna/pop/chr2_snps.txt;
cat output/ldna/pop/AUT.bim | awk '$1 == 3' | awk '{print $2}' > output/ldna/pop/chr3_snps.txt

2.5 Prepare LD matrices

Prepare files for LDna

# chr1
cat output/ldna/pop/chr1_snps.txt | awk '{print $1}' > output/ldna/pop/chr1/snps1.txt; 
echo "" > output/ldna/pop/chr1/snps2.txt; 
cat output/ldna/pop/chr1/snps2.txt output/ldna/pop/chr1/snps1.txt | gzip -9 > output/ldna/pop/chr1/snps3.txt.gz; 
cat output/ldna/pop/chr1/snps1.txt | tr '\n' ' ' |  awk -v OFS='\t' '{$1=$1}1' | gzip -9 > output/ldna/pop/chr1/header.txt.gz;

# chr2
cat output/ldna/pop/chr2_snps.txt | awk '{print $1}' > output/ldna/pop/chr2/snps1.txt; 
echo "" > output/ldna/pop/chr2/snps2.txt; 
cat output/ldna/pop/chr2/snps2.txt output/ldna/pop/chr2/snps1.txt | gzip -9 > output/ldna/pop/chr2/snps3.txt.gz; 
cat output/ldna/pop/chr2/snps1.txt | tr '\n' ' ' |  awk -v OFS='\t' '{$1=$1}1' | gzip -9 > output/ldna/pop/chr2/header.txt.gz;

# chr3
cat output/ldna/pop/chr3_snps.txt | awk '{print $1}' > output/ldna/pop/chr3/snps1.txt; 
echo "" > output/ldna/pop/chr3/snps2.txt;
cat output/ldna/pop/chr3/snps2.txt output/ldna/pop/chr3/snps1.txt | gzip -9 > output/ldna/pop/chr3/snps3.txt.gz; 
cat output/ldna/pop/chr3/snps1.txt | tr '\n' ' ' |  awk -v OFS='\t' '{$1=$1}1' | gzip -9 > output/ldna/pop/chr3/header.txt.gz;

Add header to the matrices

# chr1
for pop in $(ls -1 output/ldna/pop/chr1/*.ld.gz | sed 's/output\/ldna\/pop\/chr1\///' | sed 's/\.[^.]*$//');
do
  cat output/ldna/pop/chr1/header.txt.gz output/ldna/pop/chr1/$pop\.gz > output/ldna/pop/chr1/$pop\.txt.gz
done;

# chr2
for pop in $(ls -1 output/ldna/pop/chr2/*.ld.gz | sed 's/output\/ldna\/pop\/chr2\///' | sed 's/\.[^.]*$//');
do
  cat output/ldna/pop/chr2/header.txt.gz output/ldna/pop/chr2/$pop\.gz > output/ldna/pop/chr2/$pop\.txt.gz
done;

# chr3
for pop in $(ls -1 output/ldna/pop/chr3/*.ld.gz | sed 's/output\/ldna\/pop\/chr3\///' | sed 's/\.[^.]*$//');
do
  cat output/ldna/pop/chr3/header.txt.gz output/ldna/pop/chr3/$pop\.gz > output/ldna/pop/chr3/$pop\.txt.gz
done

We can rename the files

# rename the files (remove the ld)
rename --force 's/ld.txt.gz/txt.gz/' output/ldna/pop/chr1/*ld.txt.gz;
rename --force 's/ld.txt.gz/txt.gz/' output/ldna/pop/chr2/*ld.txt.gz;
rename --force 's/ld.txt.gz/txt.gz/' output/ldna/pop/chr3/*ld.txt.gz

Now we can add the row names (this takes time and the output files are near 1Gb)

# chr1
for pop in $(ls -1 output/ldna/pop/chr1/*.chr1.txt.gz | sed 's/output\/ldna\/pop\/chr1\///' | sed 's/\.chr1\.txt\.gz$//');
do
  paste <(gzip -d < output/ldna/pop/chr1/snps3.txt.gz) <(gzip -d < output/ldna/pop/chr1/$pop.chr1.txt.gz) > output/ldna/pop/chr1/${pop}.chr1.txt
done

# chr2
for pop in $(ls -1 output/ldna/pop/chr2/*.chr2.txt.gz | sed 's/output\/ldna\/pop\/chr2\///' | sed 's/\.chr2\.txt\.gz$//');
do
  paste <(gzip -d < output/ldna/pop/chr2/snps3.txt.gz) <(gzip -d < output/ldna/pop/chr2/$pop.chr2.txt.gz) > output/ldna/pop/chr2/${pop}.chr2.txt
done

# chr3
for pop in $(ls -1 output/ldna/pop/chr3/*.chr3.txt.gz | sed 's/output\/ldna\/pop\/chr3\///' | sed 's/\.chr3\.txt\.gz$//');
do
  paste <(gzip -d < output/ldna/pop/chr3/snps3.txt.gz) <(gzip -d < output/ldna/pop/chr3/$pop.chr3.txt.gz) > output/ldna/pop/chr3/${pop}.chr3.txt
done

I deleted the MAN files for chromosome 2 and 3 since we will not estimate LD for them. I kept chromosome 1 to show how small sample size bias the results.

3. LDna chromosome 1

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2518970 134.6    4907427 262.1         NA  3822528 204.2
## Vcells 4425377  33.8   11142633  85.1      32768 11142592  85.1

3.1 NEW

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2518450 134.5    4907427 262.1         NA  3822528 204.2
## Vcells 4424566  33.8   11142633  85.1      32768 11142592  85.1

Import the data

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr1", "NEW.chr1.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=6, method = "single")

Save ldna

saveRDS(ldna, file = here("output", "ldna", "pop", "chr1", "NEW.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr1", "NEW_ld2.rds"))

Read the data

ldna <- readRDS(file = here("output", "ldna", "pop", "chr1", "NEW.rds"))
ld2 <- readRDS(file = here("output", "ldna", "pop", "chr1", "NEW_ld2.rds"))

Check the object

str(ldna)
## List of 4
##  $ clusterfile: logi [1:7101, 1:2580] TRUE TRUE TRUE TRUE TRUE TRUE ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:7101] "AX-583033342" "AX-583035163" "AX-583035194" "AX-583035257" ...
##   .. ..$ : chr [1:2580] "2580_0.37" "2579_0.38" "2578_0.4" "2577_0.42" ...
##  $ stats      :'data.frame': 9681 obs. of  6 variables:
##   ..$ cluster       : chr [1:9681] "2580_0.37" "2579_0.38" "2578_0.4" "2577_0.42" ...
##   ..$ parent_cluster: chr [1:9681] "root" "2580_0.37" "2579_0.38" "2578_0.4" ...
##   ..$ distance      : chr [1:9681] "0.37" "0.01" "0.02" "0.02" ...
##   ..$ nV            : num [1:9681] 7101 7100 7099 7098 7094 ...
##   ..$ nE            : num [1:9681] 963232 882691 754526 586488 548570 ...
##   ..$ lambda        : num [1:9681] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tree       :List of 4
##   ..$ edge       : int [1:9680, 1:2] 7102 7102 7103 7103 7104 7104 7105 7105 7105 7105 ...
##   ..$ edge.length: num [1:9680] 0.315 0.005 0.31 0.01 0.3 0.01 0.29 0.29 0.29 0.29 ...
##   ..$ tip.label  : chr [1:7101] "AX-583033342" "AX-583035163" "AX-583035194" "AX-583035257" ...
##   ..$ Nnode      : int 2580
##   ..- attr(*, "class")= chr "phylo"
##   ..- attr(*, "order")= chr "cladewise"
##  $ lambda_min :Classes 'data.table' and 'data.frame':    2579 obs. of  2 variables:
##   ..$ V1: num [1:2579] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.52 ...
##   ..$ V2: num [1:2579] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>
str(ld2)
##  num [1:7101, 1:7101] NA 0.00552 0.00107 0.08495 0.0082 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:7101] "AX-583033342" "AX-583035163" "AX-583035194" "AX-583035257" ...
##   ..$ : chr [1:7101] "AX-583033342" "AX-583035163" "AX-583035194" "AX-583035257" ...

We can use the same parameters that we used for the other population

# snp_count <- ncol(ld2)
edges_max <- 100
edges_min <- 20

We can test different number of edges. Here we set lambda.lim to 1. We use branch.traversal=TRUE so the clusters are extracted at the base of the branch

pdf(file = here("output", "ldna", "pop", "chr1", paste(edges_min, "1.NEW_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim = 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr1", "summary_NEW.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr1", "summary_NEW.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off() 

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)

Create an objet with the clusters and the SNPs in each cluster

max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr1", "NEW_clusters_snps.rds"))
head(df)
##          304_1    1146_0.89    1317_0.88    1462_0.87    1513_0.87    1554_0.86
## 1 AX-583225844 AX-583089865 AX-583047632 AX-583085036 AX-583086832 AX-583062507
## 2 AX-583384810 AX-583091075 AX-583590642 AX-583093009 AX-583143774 AX-583080039
## 3 AX-583470080 AX-583096998 AX-583611562 AX-583093038 AX-583145470 AX-583114005
## 4 AX-583775234 AX-583103773 AX-583614130 AX-583099231 AX-583185336 AX-583139149
## 5 AX-583819418 AX-583104196 AX-583622917 AX-583194559 AX-583628542 AX-583139336
## 6 AX-583823698 AX-583224825 AX-583795432 AX-583203776 AX-583669478 AX-583156863
##      1555_0.86    1557_0.86    1560_0.86    1562_0.86    1677_0.85    1679_0.85
## 1 AX-583056906 AX-583137134 AX-583078630 AX-583090252 AX-583056483 AX-583057473
## 2 AX-583091991 AX-583977423 AX-583090213 AX-583100381 AX-583086801 AX-583061190
## 3 AX-583093718 AX-584029922 AX-583097076 AX-583170444 AX-583084951 AX-583189051
## 4 AX-583128031 AX-584030822 AX-583099042 AX-583215329 AX-583087814 AX-583215752
## 5 AX-583136877 AX-584034279 AX-583107264 AX-583355113 AX-583090453 AX-583217478
## 6 AX-583143595 AX-584044389 AX-583107709 AX-583596085 AX-583088413 AX-583237040
##      1748_0.85    1782_0.84    1787_0.84    1788_0.84    1793_0.84    1794_0.84
## 1 AX-583081545 AX-583041570 AX-583082540 AX-583082465 AX-583088401 AX-583081815
## 2 AX-583090093 AX-583090060 AX-583080693 AX-583086702 AX-583099891 AX-583088766
## 3 AX-583602373 AX-583098533 AX-583081296 AX-583087917 AX-583110004 AX-583128868
## 4 AX-583609227 AX-583106307 AX-583092936 AX-583088027 AX-583108624 AX-583132797
## 5 AX-583975361 AX-583109098 AX-583093324 AX-583088098 AX-583111691 AX-583184806
## 6 AX-584084807 AX-583128119 AX-583119352 AX-583090403 AX-583114975 AX-583547283
##      1799_0.84    1800_0.84    1825_0.84    1826_0.84    1844_0.84    1872_0.83
## 1 AX-583084494 AX-583081757 AX-583091315 AX-583321304 AX-583080669 AX-583081551
## 2 AX-583086342 AX-583084106 AX-583097298 AX-583665849 AX-583089803 AX-583084981
## 3 AX-583092423 AX-583084830 AX-583100356 AX-583833741 AX-583109443 AX-583088191
## 4 AX-583097705 AX-583091490 AX-583114039 AX-583850886 AX-583116696 AX-583090304
## 5 AX-583400294 AX-583094493 AX-583127716 AX-583993115 AX-583117803 AX-583095200
## 6 AX-583655797 AX-583098577 AX-583147515 AX-583989372 AX-583119179 AX-583100257
##      1876_0.83    1878_0.83    1880_0.83    1881_0.83    1964_0.82    2028_0.81
## 1 AX-583083945 AX-583087248 AX-583083567 AX-583093000 AX-583088409 AX-583082199
## 2 AX-583085403 AX-583193418 AX-583085757 AX-583093225 AX-583099339 AX-583085669
## 3 AX-583087341 AX-583394808 AX-583094024 AX-583138067 AX-583121954 AX-583084487
## 4 AX-583088758 AX-583546872 AX-583105051 AX-583155399 AX-583134350 AX-583085772
## 5 AX-583093591 AX-583661608 AX-583143944 AX-583299839 AX-583133017 AX-583287535
## 6 AX-583096466 AX-583665383 AX-583144077 AX-583818469 AX-583133110 AX-583359637
##      2031_0.81     2102_0.8
## 1 AX-583102108 AX-583098517
## 2 AX-583289977 AX-583119290
## 3 AX-583293711 AX-583117244
## 4 AX-583357952 AX-583118579
## 5 AX-583975226 AX-583126857
## 6 AX-583979052 AX-583129900

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##   Cluster          SNP
## 1   304_1 AX-583225844
## 2   304_1 AX-583384810
## 3   304_1 AX-583470080
## 4   304_1 AX-583775234
## 5   304_1 AX-583819418
## 6   304_1 AX-583823698
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr1", "NEW_clusters_snps1.rds"))

Make long format

new_chr1 <- melt(do, na.rm = T, value.name = "value")
head(new_chr1)
##   Var1  Var2        value
## 1    1 304_1 AX-583225844
## 2    2 304_1 AX-583384810
## 3    3 304_1 AX-583470080
## 4    4 304_1 AX-583775234
## 5    5 304_1 AX-583819418
## 6    6 304_1 AX-583823698

Update names

colnames(new_chr1)<- c("v1", "cluster", "SNP")
head(new_chr1)
##   v1 cluster          SNP
## 1  1   304_1 AX-583225844
## 2  2   304_1 AX-583384810
## 3  3   304_1 AX-583470080
## 4  4   304_1 AX-583775234
## 5  5   304_1 AX-583819418
## 6  6   304_1 AX-583823698

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
new_snps_chr1 <- import_bim(here("output", "ldna", "pop", "NEW.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(new_snps_chr1)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(new_chr1, new_snps_chr1, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP v1   cluster Chromosome Position
## 1 AX-583041570  1 1782_0.84          1  1741057
## 2 AX-583047632  1 1317_0.88          1  2776876
## 3 AX-583056483  1 1677_0.85          1 10957198
## 4 AX-583056906  1 1555_0.86          1 11046435
## 5 AX-583057473  1 1679_0.85          1 11090350
## 6 AX-583061190  2 1679_0.85          1 11359739

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##     cluster          SNP Position
## 1 1782_0.84 AX-583041570  1741057
## 2 1317_0.88 AX-583047632  2776876
## 3 1677_0.85 AX-583056483 10957198
## 4 1555_0.86 AX-583056906 11046435
## 5 1679_0.85 AX-583057473 11090350
## 6 1679_0.85 AX-583061190 11359739

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##     cluster          SNP Position
## 1 1782_0.84 AX-583041570  1741057
## 2 1317_0.88 AX-583047632  2776876
## 3 1677_0.85 AX-583056483 10957198
## 4 1555_0.86 AX-583056906 11046435
## 5 1679_0.85 AX-583057473 11090350
## 6 1679_0.85 AX-583061190 11359739
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##     cluster Position
## 1 1782_0.84  1741057
## 2 1317_0.88  2776876
## 3 1677_0.85 10957198
## 4 1555_0.86 11046435
## 5 1679_0.85 11090350
## 6 1679_0.85 11359739

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr1", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr1", "SNPs_clusters_NEW_chr1.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr1", "clusters", "sushi_NEW_chr1.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr1/clusters/sushi_NEW_chr1.txt > output/ldna/pop/chr1/clusters/sushi2_NEW_chr1.txt;
head output/ldna/pop/chr1/clusters/sushi2_NEW_chr1.txt
## Cluster  Start   End Size
## 1782_0.84    1741057 1741057 0
## 1317_0.88    2776876 2776876 0
## 1677_0.85    10957198    10957198    0
## 1555_0.86    11046435    11046435    0
## 1679_0.85    11090350    11359739    269389
## 1554_0.86    11618085    16238366    4620281
## 1844_0.84    16307511    16307511    0
## 1560_0.86    16309375    16309375    0
## 1748_0.85    16392346    16392346    0

Count how many segments

# Define the path
input_path <- here("output", "ldna", "pop", "chr1", "clusters", "sushi2_NEW_chr1.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')

# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 1146_0.89        62
## 2 1317_0.88        17
## 3 1462_0.87        26
## 4 1513_0.87        31
## 5 1554_0.86        41
## 6 1555_0.86        44

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr1",
       "clusters",
       "sushi2_NEW_chr1.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 1 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 4, title.position = "top", title.hjust = 0.5))

# # Use ggsave to save the plot as a PDF
# ggsave(
#   filename = here("output", "ldna", "pop", "chr1", "clusters", "NEW_chr1.pdf"),
#   device = "pdf",
#   width = 8,
#   height = 5,
#   units = "in"
# )

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 1 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 5, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr1", "clusters", "NEW_chr1.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can save the data to plot all chromosomes together using a facet plot

head(annotated_data)
##     Cluster     Start       End    Size Start_Mb   End_Mb nSegments
## 1 1146_0.89 226898202 228109207 1211005 226.8982 228.1092        62
## 2 1146_0.89 122806202 123508333  702131 122.8062 123.5083        62
## 3 1146_0.89 122152806 122442788  289982 122.1528 122.4428        62
## 4 1146_0.89 344173648 344497057  323409 344.1736 344.4971        62
## 5 1317_0.88 169179646 170784441 1604795 169.1796 170.7844        17
## 6 1462_0.87 254240322 254620755  380433 254.2403 254.6208        26
##   Cluster_with_SNPs
## 1    1146_0.89 (62)
## 2    1146_0.89 (62)
## 3    1146_0.89 (62)
## 4    1146_0.89 (62)
## 5    1317_0.88 (17)
## 6    1462_0.87 (26)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments    Start      End    Size Start_Mb
## 1          1 1679_0.85 0.85        39 11090350 11359739  269389 11.09035
## 2          1 1554_0.86 0.86        41 11618085 16238366 4620281 11.61809
## 3          1 1872_0.83 0.83        77 30793588 30946077  152489 30.79359
## 4          1 1788_0.84 0.84       266 31157406 31305550  148144 31.15741
## 5          1 1677_0.85 0.85       116 32809265 32964732  155467 32.80927
## 6          1 1800_0.84 0.84       103 35740120 36594644  854524 35.74012
##     End_Mb
## 1 11.35974
## 2 16.23837
## 3 30.94608
## 4 31.30555
## 5 32.96473
## 6 36.59464
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr1", "NEW_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr1",
       "summary_NEW.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(1, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("NEW", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs
## 1          1 1679_0.85 0.85 11090350 11359739        39    41
## 2          1 1554_0.86 0.86 11618085 16238366        41    50
## 3          1 1872_0.83 0.83 30793588 30946077        77    95
## 4          1 1788_0.84 0.84 31157406 31305550       266   353
## 5          1 1677_0.85 0.85 32809265 32964732       116   137
## 6          1 1800_0.84 0.84 35740120 36594644       103   121
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr1", "NEW_plot2.rds"))

3.2 AUT

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells 2734259 146.1    4907427  262.1         NA   4907427  262.1
## Vcells 4876359  37.3  196279913 1497.5      32768 306685852 2339.9

Import the data

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr1", "AUT.chr1.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=4, method = "single")

Check the object

str(ldna)

Save it

saveRDS(ldna, file = here("output", "ldna", "pop", "chr1", "AUT.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr1", "AUT_ld2.rds"))

Import data

ldna <- readRDS(here("output", "ldna", "pop", "chr1", "AUT.rds"))
ld2 <- readRDS(here("output", "ldna", "pop", "chr1", "AUT_ld2.rds"))

We can use the same parameters that we used for the other population

edges_max <- 100
edges_min <- 20

We can test the number of edges in increments of 10 and use the default phi=2

pdf(file = here("output", "ldna", "pop", "chr1", paste(edges_min, "1.AUT_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim= 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr1", "summary_AUT.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr1", "summary_AUT.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off()

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)

Save it

max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr1", "AUT_clusters_snps.rds"))
head(df)
##          486_1     984_0.92    1003_0.92    1009_0.92    1059_0.91     1124_0.9
## 1 AX-583082355 AX-583077615 AX-583082729 AX-583082540 AX-583143595 AX-583079493
## 2 AX-583080311 AX-583084229 AX-583084494 AX-583081296 AX-583179339 AX-583080039
## 3 AX-583267474 AX-583086702 AX-583092423 AX-583083191 AX-583492416 AX-583090484
## 4 AX-583271270 AX-583084981 AX-583097183 AX-583089867 AX-583662898 AX-583090510
## 5 AX-583287170 AX-583090266 AX-583125396 AX-583088098 AX-583666928 AX-583090528
## 6 AX-583316303 AX-583088219 AX-583125411 AX-583098314 AX-583669504 AX-583090539
##       1125_0.9     1170_0.9    1194_0.89    1196_0.89    1197_0.89    1199_0.89
## 1 AX-583077744 AX-583083945 AX-583104990 AX-583230275 AX-583081551 AX-583406675
## 2 AX-583084344 AX-583103603 AX-583105859 AX-583258440 AX-583085369 AX-583408737
## 3 AX-583082465 AX-583109085 AX-583124119 AX-583258576 AX-583102519 AX-583408922
## 4 AX-583083567 AX-583112775 AX-583125662 AX-583261632 AX-583105051 AX-583411931
## 5 AX-583084106 AX-583118564 AX-583188409 AX-583268906 AX-583106473 AX-583412088
## 6 AX-583084830 AX-583128119 AX-583193018 AX-583270923 AX-583114670 AX-583411506
##      1256_0.89    1259_0.89    1270_0.88    1305_0.88    1306_0.88    1322_0.88
## 1 AX-583092207 AX-583114005 AX-583224188 AX-583082457 AX-583105324 AX-583743892
## 2 AX-583109098 AX-583132712 AX-583225176 AX-583082520 AX-583169677 AX-583743894
## 3 AX-583214430 AX-583141582 AX-583226294 AX-583085036 AX-583177110 AX-583743987
## 4 AX-583217670 AX-583140182 AX-583251357 AX-583089980 AX-583221558 AX-583748192
## 5 AX-583231706 AX-583189051 AX-583250607 AX-583094202 AX-583232570 AX-583744447
## 6 AX-583719508 AX-583647602 AX-583257947 AX-583107690 AX-583755982 AX-583748419
##      1338_0.87    1341_0.87    1354_0.87    1374_0.86    1375_0.86    1379_0.86
## 1 AX-583093153 AX-583079692 AX-583290627 AX-583080693 AX-583079818 AX-583080007
## 2 AX-583095200 AX-583085669 AX-583290947 AX-583081757 AX-583080825 AX-583078552
## 3 AX-583100356 AX-583093859 AX-583346031 AX-583084779 AX-583084748 AX-583091979
## 4 AX-583102561 AX-583100893 AX-583358300 AX-583085403 AX-583086801 AX-583094398
## 5 AX-583103031 AX-583121226 AX-583361570 AX-583085757 AX-583085068 AX-583099670
## 6 AX-583103360 AX-583141203 AX-583370483 AX-583086342 AX-583087534 AX-583111048
##      1382_0.86    1394_0.86    1433_0.85    1439_0.85    1443_0.85    1444_0.85
## 1 AX-583090060 AX-583078733 AX-583421828 AX-583088401 AX-583091311 AX-583079098
## 2 AX-583088031 AX-583090182 AX-583424368 AX-583088409 AX-583095810 AX-583080228
## 3 AX-583088088 AX-583091824 AX-583427162 AX-583090530 AX-583099952 AX-583082490
## 4 AX-583090839 AX-583091844 AX-583438773 AX-583091991 AX-583098185 AX-583100329
## 5 AX-583092153 AX-583091088 AX-583437059 AX-583091075 AX-583126425 AX-583102890
## 6 AX-583095768 AX-583109671 AX-583437659 AX-583091264 AX-583128031 AX-583103403
##      1445_0.85    1490_0.84    1493_0.84    1497_0.84    1501_0.84    1503_0.84
## 1 AX-583357477 AX-583079834 AX-583078623 AX-583288144 AX-583058970 AX-583090252
## 2 AX-583362419 AX-583080058 AX-583080012 AX-583288229 AX-583062020 AX-583090681
## 3 AX-583365782 AX-583078046 AX-583082994 AX-583290392 AX-583079805 AX-583092936
## 4 AX-583366413 AX-583078360 AX-583086832 AX-583288633 AX-583080669 AX-583100257
## 5 AX-583367374 AX-583078630 AX-583084951 AX-583298090 AX-583087492 AX-583106481
## 6 AX-583365259 AX-583079173 AX-583088361 AX-583310020 AX-583089803 AX-583108373
##      1504_0.84    1510_0.84    1528_0.84    1537_0.83    1542_0.83    1559_0.83
## 1 AX-583090103 AX-583092085 AX-583061190 AX-583093324 AX-583102140 AX-583206245
## 2 AX-583088566 AX-583090755 AX-583060679 AX-583098533 AX-583114039 AX-583206573
## 3 AX-583123346 AX-583094370 AX-583064768 AX-583106307 AX-583119103 AX-583206032
## 4 AX-583132283 AX-583092805 AX-583236374 AX-583132019 AX-583124320 AX-583206628
## 5 AX-583146162 AX-583098426 AX-583348752 AX-583158766 AX-583125762 AX-583211164
## 6 AX-583159653 AX-583102108 AX-583349435 AX-583184116 AX-583142794 AX-583216104
##      1582_0.82    1588_0.82    1596_0.82    1597_0.82     1641_0.8     1645_0.8
## 1 AX-583082986 AX-583081545 AX-583418007 AX-583120153 AX-583285218 AX-583513533
## 2 AX-583085059 AX-583082498 AX-583418312 AX-583122106 AX-583356838 AX-583518586
## 3 AX-583086595 AX-583081815 AX-583420741 AX-583143646 AX-583359627 AX-583516029
## 4 AX-583093000 AX-583084263 AX-583421653 AX-583143744 AX-583363009 AX-583518994
## 5 AX-583093225 AX-583082199 AX-583425633 AX-583217478 AX-583365683 AX-583516369
## 6 AX-583103385 AX-583082356 AX-583423540 AX-583217585 AX-583363765 AX-583519741
##       1671_0.8    1677_0.79    1692_0.79    1693_0.79    1706_0.78    1708_0.78
## 1 AX-583347603 AX-583079283 AX-583106208 AX-583357017 AX-583346466 AX-583299620
## 2 AX-583591150 AX-583088766 AX-583108181 AX-583357825 AX-583354455 AX-583309007
## 3 AX-583604882 AX-583101763 AX-583215251 AX-583358478 AX-583357388 AX-583313969
## 4 AX-583610820 AX-583101880 AX-583231607 AX-583359119 AX-583357406 AX-583320534
## 5 AX-583612670 AX-583104114 AX-583647971 AX-583362478 AX-583355463 AX-583323403
## 6 AX-583614272 AX-583104130 AX-583689608 AX-583361185 AX-583356029 AX-583325321
##      1727_0.77    1777_0.76    1781_0.75    1796_0.74    1844_0.71     1876_0.7
## 1 AX-583239905 AX-583296421 AX-583297700 AX-583356612 AX-583044062 AX-583535984
## 2 AX-583246295 AX-583296765 AX-583307254 AX-583354589 AX-583045286 AX-583539059
## 3 AX-583247473 AX-583299003 AX-583311460 AX-583356907 AX-583049863 AX-583539673
## 4 AX-583252383 AX-583297006 AX-583318665 AX-583357471 AX-583050852 AX-583541426
## 5 AX-583254434 AX-583299271 AX-583318939 AX-583358435 AX-583052748 AX-583541650
## 6 AX-583254659 AX-583297247 AX-583321304 AX-583362784 AX-583052782 AX-583539120
##      1915_0.66    1930_0.65    1937_0.64    1942_0.64    1948_0.63    1950_0.63
## 1 AX-583045158 AX-583539468 AX-583229152 AX-583500511 AX-583586175 AX-583656116
## 2 AX-583046117 AX-583539627 AX-583230001 AX-583500520 AX-583743824 AX-583892406
## 3 AX-583052719 AX-583542359 AX-583230062 AX-583503153 AX-583743998 AX-583888304
## 4 AX-583052916 AX-583542421 AX-583233057 AX-583500379 AX-583747965 AX-583892527
## 5 AX-583054862 AX-583542885 AX-583595717 AX-583503258 AX-583748013 AX-583892551
## 6 AX-583056500 AX-583542929 AX-583604842 AX-583504302 AX-583748046 AX-583893585
##      1952_0.63     1972_0.6     1973_0.6
## 1 AX-583058478 AX-583035257 AX-583535944
## 2 AX-583057722 AX-583036983 AX-583533335
## 3 AX-583058629 AX-583035342 AX-583536569
## 4 AX-583060489 AX-583035538 AX-583536002
## 5 AX-583350116 AX-583035671 AX-583537293
## 6 AX-583350843 AX-583036374 AX-583541544

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##   Cluster          SNP
## 1   486_1 AX-583082355
## 2   486_1 AX-583080311
## 3   486_1 AX-583267474
## 4   486_1 AX-583271270
## 5   486_1 AX-583287170
## 6   486_1 AX-583316303
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr1", "AUT_clusters_snps1.rds"))

Make long format

aut_chr1 <- melt(do, na.rm = T, value.name = "value")
head(aut_chr1)
##   Var1  Var2        value
## 1    1 486_1 AX-583082355
## 2    2 486_1 AX-583080311
## 3    3 486_1 AX-583267474
## 4    4 486_1 AX-583271270
## 5    5 486_1 AX-583287170
## 6    6 486_1 AX-583316303

Update names

colnames(aut_chr1)<- c("v1", "cluster", "SNP")
head(aut_chr1)
##   v1 cluster          SNP
## 1  1   486_1 AX-583082355
## 2  2   486_1 AX-583080311
## 3  3   486_1 AX-583267474
## 4  4   486_1 AX-583271270
## 5  5   486_1 AX-583287170
## 6  6   486_1 AX-583316303

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
aut_snps_chr1 <- import_bim(here("output", "ldna", "pop", "AUT.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(aut_snps_chr1)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(aut_chr1, aut_snps_chr1, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP v1  cluster Chromosome Position
## 1 AX-583035257  1 1972_0.6          1   442875
## 2 AX-583035342  3 1972_0.6          1  1278179
## 3 AX-583035538  4 1972_0.6          1  1283423
## 4 AX-583035671  5 1972_0.6          1  1309469
## 5 AX-583036374  6 1972_0.6          1  1353955
## 6 AX-583036705  7 1972_0.6          1  1380337

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##    cluster          SNP Position
## 1 1972_0.6 AX-583035257   442875
## 2 1972_0.6 AX-583035342  1278179
## 3 1972_0.6 AX-583035538  1283423
## 4 1972_0.6 AX-583035671  1309469
## 5 1972_0.6 AX-583036374  1353955
## 6 1972_0.6 AX-583036705  1380337

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##    cluster          SNP Position
## 1 1972_0.6 AX-583035257   442875
## 7 1972_0.6 AX-583036983  1276832
## 2 1972_0.6 AX-583035342  1278179
## 3 1972_0.6 AX-583035538  1283423
## 4 1972_0.6 AX-583035671  1309469
## 5 1972_0.6 AX-583036374  1353955
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##    cluster Position
## 1 1972_0.6   442875
## 7 1972_0.6  1276832
## 2 1972_0.6  1278179
## 3 1972_0.6  1283423
## 4 1972_0.6  1309469
## 5 1972_0.6  1353955

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr1", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr1", "SNPs_clusters_AUT_chr1.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr1", "clusters", "sushi_AUT_chr1.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr1/clusters/sushi_AUT_chr1.txt > output/ldna/pop/chr1/clusters/sushi2_AUT_chr1.txt;
head output/ldna/pop/chr1/clusters/sushi2_AUT_chr1.txt
## Cluster  Start   End Size
## 1972_0.6 442875  2339987 1897112
## 1844_0.71    2371768 2371768 0
## 1972_0.6 2449859 2449859 0
## 1915_0.66    2663084 2663084 0
## 1844_0.71    2675644 2675644 0
## 1972_0.6 2730766 2814745 83979
## 1915_0.66    2816385 2816385 0
## 1972_0.6 2935811 3536906 601095
## 1844_0.71    3549874 3549874 0

Get SNP count

# Define the path
input_path <- here("output", "ldna", "pop", "chr1", "clusters", "sushi2_AUT_chr1.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')

# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 1003_0.92        43
## 2 1009_0.92        76
## 3 1059_0.91        34
## 4 1124_0.9        211
## 5 1125_0.9        306
## 6 1170_0.9         87

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr1",
       "clusters",
       "sushi2_AUT_chr1.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 1 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(ncol = 7, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
# ggsave(
#   filename = here("output", "ldna", "pop", "chr1", "clusters", "AUT_chr1.pdf"),
#   device = "pdf",
#   width = 8,
#   height = 5,
#   units = "in"
# )

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 1 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 6, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr1", "clusters", "AUT_chr1.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments    Start      End    Size  Start_Mb
## 1          1  1972_0.6  0.6        20   442875  2339987 1897112  0.442875
## 2          1  1972_0.6  0.6        20  2935811  3536906  601095  2.935811
## 3          1 1844_0.71 0.71        20  3766945  4238554  471609  3.766945
## 4          1 1915_0.66 0.66        23  4981859  5217622  235763  4.981859
## 5          1 1844_0.71 0.71        20  5443736 10957198 5513462  5.443736
## 6          1 1952_0.63 0.63        20 11125830 11276238  150408 11.125830
##      End_Mb
## 1  2.339987
## 2  3.536906
## 3  4.238554
## 4  5.217622
## 5 10.957198
## 6 11.276238
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr1", "AUT_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr1",
       "summary_AUT.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(1, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("AUT", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs
## 1          1  1972_0.6  0.6   442875  2339987        20   124
## 2          1  1972_0.6  0.6  2935811  3536906        20   124
## 3          1 1844_0.71 0.71  3766945  4238554        20    45
## 4          1 1915_0.66 0.66  4981859  5217622        23    30
## 5          1 1844_0.71 0.71  5443736 10957198        20    45
## 6          1 1952_0.63 0.63 11125830 11276238        20    31
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr1", "AUT_plot2.rds"))

4. LDna chromosome 2

Because MAN has only 10 mosquitoes we will not estimate LD for it

4.2 NEW

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells 2736156 146.2    4907427  262.1         NA   4907427  262.1
## Vcells 4883919  37.3  260771499 1989.6      32768 339462734 2589.9

Import the data (make sure you have enough memory)

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr2", "NEW.chr2.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=4, method = "single")

Save ldna

saveRDS(ldna, file = here("output", "ldna", "pop", "chr2", "NEW.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr2", "NEW_ld2.rds"))

Import the data

ldna <- readRDS(here("output", "ldna", "pop", "chr2", "NEW.rds"))
ld2 <- readRDS(here("output", "ldna", "pop", "chr2", "NEW_ld2.rds"))

Check the object

str(ldna)
## List of 4
##  $ clusterfile: logi [1:13288, 1:4574] TRUE TRUE TRUE TRUE TRUE TRUE ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:13288] "AX-584286863" "AX-584290749" "AX-584288500" "AX-584288604" ...
##   .. ..$ : chr [1:4574] "4574_0.37" "4573_0.43" "4572_0.44" "4571_0.45" ...
##  $ stats      :'data.frame': 17862 obs. of  6 variables:
##   ..$ cluster       : chr [1:17862] "4574_0.37" "4573_0.43" "4572_0.44" "4571_0.45" ...
##   ..$ parent_cluster: chr [1:17862] "root" "4574_0.37" "4573_0.43" "4572_0.44" ...
##   ..$ distance      : chr [1:17862] "0.37" "0.06" "0.01" "0.01" ...
##   ..$ nV            : num [1:17862] 13288 13287 13286 13285 13284 ...
##   ..$ nE            : num [1:17862] 2560790 1335282 1335282 1209227 1112425 ...
##   ..$ lambda        : num [1:17862] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tree       :List of 4
##   ..$ edge       : int [1:17861, 1:2] 13289 13289 13290 13290 13291 13291 13292 13292 13293 13293 ...
##   ..$ edge.length: num [1:17861] 0.315 0.03 0.285 0.005 0.28 0.005 0.275 0.005 0.27 0.27 ...
##   ..$ tip.label  : chr [1:13288] "AX-584286863" "AX-584290749" "AX-584288500" "AX-584288604" ...
##   ..$ Nnode      : int 4574
##   ..- attr(*, "class")= chr "phylo"
##   ..- attr(*, "order")= chr "cladewise"
##  $ lambda_min :Classes 'data.table' and 'data.frame':    4573 obs. of  2 variables:
##   ..$ V1: num [1:4573] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
##   ..$ V2: num [1:4573] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>

We can use the same parameters that we used for the other population

edges_max <- 100
edges_min <- 20

We can test the number of edges in increments of 10 and use the default phi=2.

pdf(file = here("output", "ldna", "pop", "chr2", paste(edges_min, "2.NEW_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim= 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr2", "summary_NEW.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr2", "summary_NEW.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off()

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)

Save it

max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr2", "AUT_clusters_snps.rds"))
head(df)
##      2028_0.87    2384_0.85    2457_0.85    2543_0.84    2544_0.84    2545_0.84
## 1 AX-584364545 AX-584296845 AX-584386160 AX-584291291 AX-584288500 AX-584292213
## 2 AX-584369094 AX-584305876 AX-584396189 AX-584298198 AX-584290629 AX-584292851
## 3 AX-584381578 AX-584303903 AX-584396224 AX-584301326 AX-584296659 AX-584295327
## 4 AX-584381638 AX-584307353 AX-584398629 AX-584299194 AX-584297734 AX-584298318
## 5 AX-584400708 AX-584308131 AX-584396883 AX-584305293 AX-584297020 AX-584307281
## 6 AX-584467812 AX-584308905 AX-584399251 AX-584303566 AX-584300137 AX-584308135
##      2608_0.84    2624_0.84    2713_0.83    2748_0.83    2944_0.82    2995_0.82
## 1 AX-584297676 AX-584354221 AX-584295694 AX-584294474 AX-584298215 AX-582438087
## 2 AX-584306598 AX-584356256 AX-584334344 AX-584294917 AX-584323661 AX-582529253
## 3 AX-584317413 AX-584515581 AX-584419498 AX-584306577 AX-584324667 AX-579462195
## 4 AX-584320188 AX-584638544 AX-584420126 AX-584308199 AX-584327659 AX-579462236
## 5 AX-584321867 AX-585147319 AX-584422213 AX-584319297 AX-584344677 AX-579463414
## 6 AX-584384738 AX-585147377 AX-585032328 AX-584321828 AX-584359719 AX-579463494
##      3119_0.81     3264_0.8    3426_0.79    3951_0.75
## 1 AX-584309060 AX-584296631 AX-584292420 AX-584515491
## 2 AX-584322622 AX-584296215 AX-584300563 AX-584568661
## 3 AX-584327466 AX-584298922 AX-584301384 AX-584575386
## 4 AX-584338229 AX-584302721 AX-584302194 AX-585084194
## 5 AX-584346954 AX-584302820 AX-584305790 AX-585084217
## 6 AX-584351026 AX-584301000 AX-584303568 AX-585088305

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##     Cluster          SNP
## 1 2028_0.87 AX-584364545
## 2 2028_0.87 AX-584369094
## 3 2028_0.87 AX-584381578
## 4 2028_0.87 AX-584381638
## 5 2028_0.87 AX-584400708
## 6 2028_0.87 AX-584467812
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr2", "AUT_clusters_snps1.rds"))

Make long format

new_chr2 <- melt(do, na.rm = T, value.name = "value")
head(new_chr2)
##   Var1      Var2        value
## 1    1 2028_0.87 AX-584364545
## 2    2 2028_0.87 AX-584369094
## 3    3 2028_0.87 AX-584381578
## 4    4 2028_0.87 AX-584381638
## 5    5 2028_0.87 AX-584400708
## 6    6 2028_0.87 AX-584467812

Update names

colnames(new_chr2)<- c("v1", "cluster", "SNP")
head(new_chr2)
##   v1   cluster          SNP
## 1  1 2028_0.87 AX-584364545
## 2  2 2028_0.87 AX-584369094
## 3  3 2028_0.87 AX-584381578
## 4  4 2028_0.87 AX-584381638
## 5  5 2028_0.87 AX-584400708
## 6  6 2028_0.87 AX-584467812

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
new_snps_chr2 <- import_bim(here("output", "ldna", "pop", "NEW.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(new_snps_chr2)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(new_chr2, new_snps_chr2, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP  v1   cluster Chromosome  Position
## 1 AX-579439570 195 2543_0.84          2 360434528
## 2 AX-579439703  48 2384_0.85          2 360461064
## 3 AX-579440157 146 3426_0.79          2 360617545
## 4 AX-579440159 147 3426_0.79          2 360617794
## 5 AX-579440342  27 2544_0.84          2 360683864
## 6 AX-579442008  28 2544_0.84          2 361118184

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##     cluster          SNP  Position
## 1 2543_0.84 AX-579439570 360434528
## 2 2384_0.85 AX-579439703 360461064
## 3 3426_0.79 AX-579440157 360617545
## 4 3426_0.79 AX-579440159 360617794
## 5 2544_0.84 AX-579440342 360683864
## 6 2544_0.84 AX-579442008 361118184

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##        cluster          SNP Position
## 1416 2544_0.84 AX-584288500  2136706
## 1418 2543_0.84 AX-584291291  2223190
## 1419 2545_0.84 AX-584292213  2320220
## 1420 3426_0.79 AX-584292420  2339290
## 1421 2545_0.84 AX-584292851  2376034
## 1417 2544_0.84 AX-584290629  2381422
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##        cluster Position
## 1416 2544_0.84  2136706
## 1418 2543_0.84  2223190
## 1419 2545_0.84  2320220
## 1420 3426_0.79  2339290
## 1421 2545_0.84  2376034
## 1417 2544_0.84  2381422

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr2", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr2", "SNPs_clusters_NEW_chr2.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr2", "clusters", "sushi_NEW_chr2.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr2/clusters/sushi_NEW_chr2.txt > output/ldna/pop/chr2/clusters/sushi2_NEW_chr2.txt;
head output/ldna/pop/chr2/clusters/sushi2_NEW_chr2.txt
## Cluster  Start   End Size
## 2544_0.84    2136706 2136706 0
## 2543_0.84    2223190 2223190 0
## 2545_0.84    2320220 2320220 0
## 3426_0.79    2339290 2339290 0
## 2545_0.84    2376034 2376034 0
## 2544_0.84    2381422 2381422 0
## 3264_0.8 2883089 2883089 0
## 2544_0.84    2883824 2883824 0
## 2748_0.83    2928346 3035811 107465
# Define the path
input_path <- here("output", "ldna", "pop", "chr2", "clusters", "sushi2_NEW_chr2.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)


# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')

# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 2028_0.87        25
## 2 2384_0.85       102
## 3 2457_0.85        27
## 4 2543_0.84       372
## 5 2544_0.84        52
## 6 2545_0.84       351

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr2",
       "clusters",
       "sushi2_NEW_chr2.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 2 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 3, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
# ggsave(
#   filename = here("output", "ldna", "pop", "chr2", "clusters", "NEW_chr2.pdf"),
#   device = "pdf",
#   width = 8,
#   height = 5,
#   units = "in"
# )

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 2 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 3, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr2", "clusters", "NEW_chr2.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

Check how many SNPs per cluster

# Count non-NA values in each column
non_na_counts <- colSums(!is.na(df))

# View the counts
non_na_counts
## 2028_0.87 2384_0.85 2457_0.85 2543_0.84 2544_0.84 2545_0.84 2608_0.84 2624_0.84 
##        35       112        39       674        79       633        48        37 
## 2713_0.83 2748_0.83 2944_0.82 2995_0.82 3119_0.81  3264_0.8 3426_0.79 3951_0.75 
##        39       208        63        39        54        55       326        62

Check if it matches with the output from LDna

# Replace 'path_to_file.txt' with the actual path to your text file
snp_data <- read.table(here("output", "ldna", "pop", "chr2", "summary_NEW.txt"), header = TRUE, sep = "\t")

# View the first few rows of the data
head(snp_data)
##        Name Type Merge.at nLoci   nE    lambda Median.LD MAD.LD
## 1 2028_0.87  SOC     0.84    35  116  1.475015     0.612  0.150
## 2 2384_0.85  SOC     0.84   112  464  1.770018     0.398  0.166
## 3 2457_0.85  SOC     0.82    39  131  1.438140     0.680  0.119
## 4 2543_0.84  SOC     0.83   674 4979 11.666165     0.458  0.136
## 5 2544_0.84  SOC     0.83    79  233  2.675346     0.600  0.118
## 6 2545_0.84  SOC     0.83   633 2573  7.621915     0.348  0.147

It does match.

We can save the data to plot all chromosomes together using a facet plot

head(annotated_data)
##     Cluster     Start       End   Size  Start_Mb    End_Mb nSegments
## 1 2384_0.85 572618301 572720232 101931 572.61830 572.72023       102
## 2 2384_0.85 121846729 122778671 931942 121.84673 122.77867       102
## 3 2384_0.85 529595105 529716047 120942 529.59510 529.71605       102
## 4 2384_0.85 557464055 557799648 335593 557.46406 557.79965       102
## 5 2457_0.85  59513921  60031162 517241  59.51392  60.03116        27
## 6 2457_0.85  60109082  60294212 185130  60.10908  60.29421        27
##   Cluster_with_SNPs
## 1   2384_0.85 (102)
## 2   2384_0.85 (102)
## 3   2384_0.85 (102)
## 4   2384_0.85 (102)
## 5    2457_0.85 (27)
## 6    2457_0.85 (27)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments    Start      End   Size  Start_Mb
## 1          1 2748_0.83 0.83       161  2928346  3035811 107465  2.928346
## 2          1 2543_0.84 0.84       372  6886348  7055166 168818  6.886348
## 3          1 2543_0.84 0.84       372  7248250  7523031 274781  7.248250
## 4          1 2545_0.84 0.84       351  8122368  8243905 121537  8.122368
## 5          1 2543_0.84 0.84       372  8295100  8538180 243080  8.295100
## 6          1 2543_0.84 0.84       372 10781210 10897176 115966 10.781210
##      End_Mb
## 1  3.035811
## 2  7.055166
## 3  7.523031
## 4  8.243905
## 5  8.538180
## 6 10.897176
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr2", "NEW_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr2",
       "summary_NEW.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(2, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("NEW", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs
## 1          2 2748_0.83 0.83  2928346  3035811       161   208
## 2          2 2543_0.84 0.84  6886348  7055166       372   674
## 3          2 2543_0.84 0.84  7248250  7523031       372   674
## 4          2 2545_0.84 0.84  8122368  8243905       351   633
## 5          2 2543_0.84 0.84  8295100  8538180       372   674
## 6          2 2543_0.84 0.84 10781210 10897176       372   674
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr2", "NEW_plot2.rds"))

3.4 AUT

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()

Import the data

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr2", "AUT.chr2.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=6, method = "single")

Check the object

str(ldna)

Save it

saveRDS(ldna, file = here("output", "ldna", "pop", "chr2", "AUT.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr2", "AUT_ld2.rds"))

Load data

ldna <- readRDS(here("output", "ldna", "pop", "chr2", "AUT.rds"))
ld2 <- readRDS(here("output", "ldna", "pop", "chr2", "AUT_ld2.rds"))

We can use the same parameters that we used for the other population

# snp_count <- ncol(ld2)
edges_max <- 100
edges_min <- 20

We can test the number of edges in increments of 10 and use the default phi=2

pdf(file = here("output", "ldna", "pop", "chr2", paste(edges_min, "2.AUT_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim= 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr2", "summary_AUT.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr2", "summary_AUT.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off()

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)
max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr2", "AUT_clusters_snps.rds"))
head(df)
##      1358_0.96    1758_0.92    1821_0.92    1837_0.92    1857_0.92    1868_0.92
## 1 AX-584306626 AX-584293484 AX-584304195 AX-584301245 AX-584311262 AX-584294045
## 2 AX-584310382 AX-584295327 AX-584346777 AX-584305931 AX-584311750 AX-584301847
## 3 AX-584312050 AX-584298215 AX-584384807 AX-584315163 AX-584317022 AX-584312682
## 4 AX-584316395 AX-584299503 AX-584636999 AX-584321009 AX-584320236 AX-584311672
## 5 AX-584317020 AX-584306232 AX-579791141 AX-584323656 AX-584323515 AX-584311957
## 6 AX-584317736 AX-584303951 AX-579807464 AX-584329379 AX-584329483 AX-584323796
##      1975_0.91    1988_0.91    2002_0.91     2105_0.9     2125_0.9     2158_0.9
## 1 AX-584300351 AX-585104529 AX-584306358 AX-584293814 AX-584305293 AX-584315580
## 2 AX-584330244 AX-584595865 AX-584306361 AX-584297718 AX-584305624 AX-584328040
## 3 AX-584344116 AX-585104739 AX-584308161 AX-584314308 AX-584308199 AX-584339290
## 4 AX-584372660 AX-584600991 AX-584325887 AX-584329799 AX-584323840 AX-584360344
## 5 AX-584410215 AX-585110237 AX-584329230 AX-584330131 AX-584328642 AX-584360636
## 6 AX-584425454 AX-584604851 AX-584337453 AX-584331488 AX-584358240 AX-584385502
##      2177_0.89    2178_0.89    2179_0.89    2180_0.89    2182_0.89    2183_0.89
## 1 AX-584288666 AX-584292851 AX-584300876 AX-584291260 AX-584291291 AX-584293019
## 2 AX-584289002 AX-584298948 AX-584298669 AX-584292420 AX-584295694 AX-584291278
## 3 AX-584294474 AX-584301919 AX-584300321 AX-584294917 AX-584298922 AX-584295461
## 4 AX-584299568 AX-584299606 AX-584300385 AX-584296215 AX-584303999 AX-584299881
## 5 AX-584300543 AX-584305861 AX-584307397 AX-584305903 AX-584305790 AX-584301066
## 6 AX-584300616 AX-584306567 AX-584307916 AX-584308617 AX-584303566 AX-584298963
##      2185_0.89    2259_0.89    2260_0.89    2261_0.89    2293_0.89    2308_0.89
## 1 AX-584298225 AX-584296944 AX-584292213 AX-584294476 AX-584298361 AX-584659351
## 2 AX-584301153 AX-584300563 AX-584296631 AX-584305876 AX-584303143 AX-584659447
## 3 AX-584325005 AX-584301326 AX-584297676 AX-584307281 AX-584304565 AX-584659545
## 4 AX-584343433 AX-584305114 AX-584301502 AX-584307353 AX-584306660 AX-585168307
## 5 AX-584354787 AX-584310620 AX-584305421 AX-584308745 AX-584312672 AX-585168630
## 6 AX-584358368 AX-584361121 AX-584309904 AX-584321397 AX-584328187 AX-584668184
##      2353_0.88    2394_0.88    2408_0.88    2472_0.87    2485_0.87    2530_0.87
## 1 AX-584289640 AX-584598345 AX-579547581 AX-584658336 AX-584297242 AX-585198965
## 2 AX-584298198 AX-584600906 AX-579547627 AX-585167919 AX-584299395 AX-584691314
## 3 AX-584300399 AX-584604786 AX-579676317 AX-584662405 AX-584299741 AX-584691329
## 4 AX-584303568 AX-584605376 AX-579678007 AX-582607287 AX-584302979 AX-584691600
## 5 AX-584306269 AX-584605418 AX-579678792 AX-582617763 AX-584306813 AX-584692092
## 6 AX-584307198 AX-584605784 AX-579679017 AX-582622634 AX-584306576 AX-584692148
##      2532_0.86    2535_0.86    2693_0.85    2733_0.85    2743_0.85    2757_0.84
## 1 AX-584290749 AX-579702593 AX-584318708 AX-582609574 AX-582603578 AX-584294717
## 2 AX-584289319 AX-579703208 AX-584347785 AX-582609695 AX-582611436 AX-584298836
## 3 AX-584299680 AX-579703990 AX-584385390 AX-582625349 AX-582625257 AX-584299293
## 4 AX-584300309 AX-579705846 AX-584534601 AX-582629579 AX-582635519 AX-584308179
## 5 AX-584308218 AX-579704453 AX-585105623 AX-582630620 AX-582641947 AX-584310683
## 6 AX-584322051 AX-579706712 AX-585143687 AX-582631805 AX-582641957 AX-584311218
##      2761_0.84    2762_0.84    2792_0.84    2812_0.84    2827_0.84    2840_0.83
## 1 AX-584593188 AX-585314120 AX-582618714 AX-585162205 AX-585241182 AX-584324541
## 2 AX-585101931 AX-584805636 AX-585228013 AX-584658365 AX-585270603 AX-584413742
## 3 AX-584626008 AX-584805651 AX-584719396 AX-584660002 AX-584761983 AX-584592203
## 4 AX-585134987 AX-585315018 AX-585229267 AX-584665788 AX-584761986 AX-585117444
## 5 AX-585148398 AX-584807440 AX-585229270 AX-584669363 AX-585270835 AX-585125916
## 6 AX-584640115 AX-584808100 AX-584721431 AX-582605942 AX-584762818 AX-584617377
##      2842_0.83    2876_0.83    2891_0.83    2892_0.83    2911_0.83    2913_0.83
## 1 AX-582612583 AX-582613274 AX-584292517 AX-585141669 AX-582524398 AX-579676926
## 2 AX-582618195 AX-582619159 AX-584299316 AX-584653323 AX-582525390 AX-579677573
## 3 AX-582619234 AX-582619871 AX-584298590 AX-585168028 AX-582525959 AX-579679990
## 4 AX-582625828 AX-582634345 AX-584299194 AX-585170223 AX-582526291 AX-579678683
## 5 AX-582631839 AX-582637808 AX-584301851 AX-582608090 AX-582526390 AX-579680330
## 6 AX-582643947 AX-585222154 AX-584308673 AX-585208490 AX-582527330 AX-579681479
##      2918_0.82    2927_0.82    2930_0.82    2967_0.82    2972_0.82    2982_0.82
## 1 AX-582602978 AX-585150359 AX-582627235 AX-582453732 AX-584393501 AX-582447006
## 2 AX-582604639 AX-585155926 AX-582627328 AX-582455584 AX-584437766 AX-582447946
## 3 AX-582604666 AX-584652514 AX-582626373 AX-582456078 AX-582565193 AX-582449264
## 4 AX-582608509 AX-584658607 AX-582645178 AX-582457649 AX-584912770 AX-582449894
## 5 AX-582608518 AX-584661534 AX-584700651 AX-582458541 AX-584913691 AX-582450493
## 6 AX-582613484 AX-582607708 AX-584702495 AX-582458981 AX-585453868 AX-582450499
##      2991_0.81    2998_0.81    3005_0.81    3006_0.81    3009_0.81     3020_0.8
## 1 AX-584288500 AX-584645493 AX-584298191 AX-584600807 AX-584689536 AX-584288604
## 2 AX-584288620 AX-585166935 AX-584303832 AX-585134800 AX-585198572 AX-584298018
## 3 AX-584289805 AX-585167771 AX-584306598 AX-585135668 AX-585201152 AX-584298270
## 4 AX-584290629 AX-584659102 AX-584310439 AX-585137251 AX-584692505 AX-584308860
## 5 AX-584293911 AX-585167796 AX-584311187 AX-584632208 AX-584692897 AX-584311586
## 6 AX-584294425 AX-584659454 AX-584313888 AX-584661807 AX-584692902 AX-584319604
##       3022_0.8     3030_0.8     3031_0.8     3043_0.8     3053_0.8     3063_0.8
## 1 AX-584639979 AX-584690378 AX-582528002 AX-584298943 AX-585134301 AX-579622196
## 2 AX-584641155 AX-585199040 AX-582528028 AX-584299411 AX-585152062 AX-579620625
## 3 AX-584642622 AX-584690675 AX-582528673 AX-584301843 AX-584643732 AX-579621347
## 4 AX-585151583 AX-584690737 AX-582528685 AX-584299567 AX-585164988 AX-579624053
## 5 AX-584647191 AX-584691233 AX-582528295 AX-584318724 AX-585168494 AX-579625864
## 6 AX-585157838 AX-584692547 AX-582529821 AX-584321810 AX-585178207 AX-579624609
##       3071_0.8     3077_0.8    3096_0.79    3097_0.79    3104_0.79    3112_0.79
## 1 AX-585138110 AX-585359327 AX-579445968 AX-579500176 AX-585135312 AX-585234700
## 2 AX-585138478 AX-579439473 AX-579450228 AX-579500738 AX-584627202 AX-584726097
## 3 AX-584629839 AX-579637200 AX-579449539 AX-579502409 AX-584629312 AX-584729510
## 4 AX-584629911 AX-579638885 AX-579449559 AX-579501201 AX-582527977 AX-585252750
## 5 AX-584631673 AX-579638960 AX-579449596 AX-579501306 AX-582528993 AX-585252990
## 6 AX-585145247 AX-579638979 AX-579449963 AX-579501546 AX-582529293 AX-584747691
##      3116_0.79    3129_0.78    3139_0.78    3140_0.78    3149_0.78    3165_0.77
## 1 AX-579446949 AX-584655936 AX-579450486 AX-584289546 AX-582630710 AX-584293965
## 2 AX-579446023 AX-585168648 AX-579456609 AX-584296845 AX-584704993 AX-584296764
## 3 AX-579449389 AX-584664909 AX-579459157 AX-584300275 AX-585213812 AX-584297734
## 4 AX-579448182 AX-584664936 AX-579459504 AX-584298246 AX-585224576 AX-584297628
## 5 AX-579450439 AX-584665009 AX-579459790 AX-584298882 AX-584719356 AX-584300380
## 6 AX-579451664 AX-585173827 AX-579461404 AX-584299945 AX-585228026 AX-584302108
##      3166_0.77    3208_0.76    3222_0.76    3227_0.76    3247_0.75    3248_0.75
## 1 AX-584774863 AX-582455201 AX-585438099 AX-584900262 AX-579642471 AX-585203715
## 2 AX-584788824 AX-582455981 AX-584929953 AX-584902930 AX-579642492 AX-584695652
## 3 AX-585300122 AX-582457206 AX-584930177 AX-585412052 AX-579642513 AX-584695875
## 4 AX-579674232 AX-582457258 AX-585438939 AX-584904005 AX-579642789 AX-582404449
## 5 AX-579676064 AX-582457745 AX-584930662 AX-585413454 AX-579641357 AX-582408424
## 6 AX-579683637 AX-582458554 AX-584930879 AX-584906285 AX-579641533 AX-582416983
##      3254_0.75    3263_0.74    3267_0.74    3270_0.74    3273_0.74    3277_0.74
## 1 AX-584862852 AX-585071703 AX-582505966 AX-585456097 AX-585193411 AX-582453594
## 2 AX-584862862 AX-579675236 AX-582511125 AX-584948765 AX-584684759 AX-582454026
## 3 AX-585374950 AX-579676841 AX-582511590 AX-584950738 AX-584685571 AX-582454042
## 4 AX-585376053 AX-579676365 AX-582514721 AX-584950844 AX-585194846 AX-582454521
## 5 AX-585376647 AX-579679605 AX-582522512 AX-584951613 AX-584686905 AX-582455526
## 6 AX-584868034 AX-579697462 AX-582523473 AX-585460294 AX-585195579 AX-582455796
##      3288_0.73    3299_0.73    3331_0.72    3342_0.72    3345_0.71     3366_0.7
## 1 AX-582564350 AX-584681146 AX-584805542 AX-585180502 AX-582512679 AX-585181492
## 2 AX-585275074 AX-585189918 AX-585314476 AX-584671895 AX-582515131 AX-585181563
## 3 AX-585419563 AX-584684105 AX-585314596 AX-585180829 AX-582519638 AX-585181623
## 4 AX-584911933 AX-585193592 AX-585314617 AX-584672267 AX-582522334 AX-584673367
## 5 AX-585420755 AX-584685120 AX-584806621 AX-585181124 AX-582527641 AX-585182067
## 6 AX-585421027 AX-584686105 AX-585316409 AX-585181236 AX-582529322 AX-585182068
##      3382_0.69    3394_0.68    3403_0.67    3409_0.67    3416_0.66    3422_0.65
## 1 AX-579677786 AX-585411064 AX-584889831 AX-584890550 AX-585179336 AX-585399153
## 2 AX-579681326 AX-585411092 AX-585398553 AX-585428101 AX-585179355 AX-584911160
## 3 AX-579683343 AX-584902516 AX-585399043 AX-585428436 AX-584670713 AX-584912804
## 4 AX-579683736 AX-584902620 AX-584891039 AX-585428445 AX-585179418 AX-584914201
## 5 AX-579682847 AX-585411587 AX-584891165 AX-584920293 AX-585179450 AX-584914385
## 6 AX-579689080 AX-584903232 AX-585400221 AX-585428982 AX-584670822 AX-585423647
##      3430_0.65    3439_0.64    3443_0.63    3446_0.63    3448_0.63    3452_0.62
## 1 AX-584671288 AX-585471166 AX-584766531 AX-584946419 AX-584766445 AX-584691876
## 2 AX-585179975 AX-584963480 AX-584941895 AX-585455535 AX-584929098 AX-584692549
## 3 AX-584671394 AX-584963744 AX-584942273 AX-585455736 AX-584933359 AX-585202320
## 4 AX-585180065 AX-584966027 AX-584943016 AX-585456654 AX-584933603 AX-585202374
## 5 AX-584671451 AX-585474705 AX-585452119 AX-585456852 AX-584935054 AX-585203555
## 6 AX-585180165 AX-584968166 AX-584946573 AX-584948607 AX-584935127 AX-582402737
##      3455_0.61     3457_0.6
## 1 AX-584917290 AX-585398780
## 2 AX-585426057 AX-584890320
## 3 AX-585426155 AX-585399008
## 4 AX-584917565 AX-584890476
## 5 AX-584918732 AX-584890511
## 6 AX-584918757 AX-585399117

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##     Cluster          SNP
## 1 1358_0.96 AX-584306626
## 2 1358_0.96 AX-584310382
## 3 1358_0.96 AX-584312050
## 4 1358_0.96 AX-584316395
## 5 1358_0.96 AX-584317020
## 6 1358_0.96 AX-584317736
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr2", "AUT_clusters_snps1.rds"))

Make long format

aut_chr2 <- melt(do, na.rm = T, value.name = "value")
head(aut_chr2)
##   Var1      Var2        value
## 1    1 1358_0.96 AX-584306626
## 2    2 1358_0.96 AX-584310382
## 3    3 1358_0.96 AX-584312050
## 4    4 1358_0.96 AX-584316395
## 5    5 1358_0.96 AX-584317020
## 6    6 1358_0.96 AX-584317736

Update names

colnames(aut_chr2)<- c("v1", "cluster", "SNP")
head(aut_chr2)
##   v1   cluster          SNP
## 1  1 1358_0.96 AX-584306626
## 2  2 1358_0.96 AX-584310382
## 3  3 1358_0.96 AX-584312050
## 4  4 1358_0.96 AX-584316395
## 5  5 1358_0.96 AX-584317020
## 6  6 1358_0.96 AX-584317736

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
aut_snps_chr <- import_bim(here("output", "ldna", "pop", "AUT.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(aut_snps_chr)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(aut_chr2, aut_snps_chr, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP v1   cluster Chromosome  Position
## 1 AX-579436348 52 3439_0.64          2 359895186
## 2 AX-579436431 54 3439_0.64          2 359897330
## 3 AX-579436724 57 3439_0.64          2 359914399
## 4 AX-579436929 59 3439_0.64          2 359917288
## 5 AX-579436978 60 3439_0.64          2 359918537
## 6 AX-579437055 61 3439_0.64          2 359920163

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##     cluster          SNP  Position
## 1 3439_0.64 AX-579436348 359895186
## 2 3439_0.64 AX-579436431 359897330
## 3 3439_0.64 AX-579436724 359914399
## 4 3439_0.64 AX-579436929 359917288
## 5 3439_0.64 AX-579436978 359918537
## 6 3439_0.64 AX-579437055 359920163

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##        cluster          SNP Position
## 5253 2532_0.86 AX-584290749  2136162
## 5243 2991_0.81 AX-584288500  2136706
## 5244  3020_0.8 AX-584288604  2163818
## 5245 2991_0.81 AX-584288620  2164064
## 5246 2177_0.89 AX-584288666  2172646
## 5254 2180_0.89 AX-584291260  2222481
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##        cluster Position
## 5253 2532_0.86  2136162
## 5243 2991_0.81  2136706
## 5244  3020_0.8  2163818
## 5245 2991_0.81  2164064
## 5246 2177_0.89  2172646
## 5254 2180_0.89  2222481

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr2", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr2", "SNPs_clusters_AUT_chr2.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr2", "clusters", "sushi_AUT_chr2.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr2/clusters/sushi_AUT_chr2.txt > output/ldna/pop/chr2/clusters/sushi2_AUT_chr2.txt;
head output/ldna/pop/chr2/clusters/sushi2_AUT_chr2.txt
## Cluster  Start   End Size
## 2532_0.86    2136162 2136162 0
## 2991_0.81    2136706 2136706 0
## 3020_0.8 2163818 2163818 0
## 2991_0.81    2164064 2164064 0
## 2177_0.89    2172646 2172646 0
## 2180_0.89    2222481 2222481 0
## 2177_0.89    2222896 2222896 0
## 2182_0.89    2223190 2223190 0
## 2532_0.86    2255745 2255745 0

Get SNP count

# Define the path
input_path <- here("output", "ldna", "pop", "chr2", "clusters", "sushi2_AUT_chr2.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')

# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 1358_0.96       108
## 2 1758_0.92       253
## 3 1821_0.92        29
## 4 1837_0.92       123
## 5 1857_0.92        60
## 6 1868_0.92       146

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr2",
       "clusters",
       "sushi2_AUT_chr2.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 2 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(ncol = 7, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr2", "clusters", "AUT_chr2.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 2 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 11, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr2", "clusters", "AUT_chr2b.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments    Start      End   Size Start_Mb
## 1          1 2183_0.89 0.89       388 12234890 12464712 229822 12.23489
## 2          1 3140_0.78 0.78       456 30629866 31142180 512314 30.62987
## 3          1 2180_0.89 0.89       394 33272312 33407854 135542 33.27231
## 4          1 3140_0.78 0.78       456 33522896 33668267 145371 33.52290
## 5          1 2183_0.89 0.89       388 39346067 39477392 131325 39.34607
## 6          1 2991_0.81 0.81       445 44049952 44160436 110484 44.04995
##     End_Mb
## 1 12.46471
## 2 31.14218
## 3 33.40785
## 4 33.66827
## 5 39.47739
## 6 44.16044
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr2", "AUT_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr2",
       "summary_AUT.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(2, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("AUT", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs
## 1          2 2183_0.89 0.89 12234890 12464712       388   455
## 2          2 3140_0.78 0.78 30629866 31142180       456   564
## 3          2 2180_0.89 0.89 33272312 33407854       394   479
## 4          2 3140_0.78 0.78 33522896 33668267       456   564
## 5          2 2183_0.89 0.89 39346067 39477392       388   455
## 6          2 2991_0.81 0.81 44049952 44160436       445   566
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr2", "AUT_plot2.rds"))

5. LDna chromosome 3

Because MAN has only 10 mosquitoes we will not estimate LD for it

4.2 NEW

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells 2725371 145.6    5389527  287.9         NA    5389527  287.9
## Vcells 4865185  37.2  600039580 4578.0      32768 1171951721 8941.3

Import the data (make sure you have enough memory)

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr3", "NEW.chr3.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=6, method = "single")

Save ldna

saveRDS(ldna, file = here("output", "ldna", "pop", "chr3", "NEW.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr3", "NEW_ld2.rds"))
ldna <- readRDS(here("output", "ldna", "pop", "chr3", "NEW.rds"))
ld2 <- readRDS(here("output", "ldna", "pop", "chr3", "NEW_ld2.rds"))

Check the object

str(ldna)
## List of 4
##  $ clusterfile: logi [1:12343, 1:4301] TRUE TRUE TRUE TRUE TRUE TRUE ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:12343] "AX-580321672" "AX-580321676" "AX-580324669" "AX-580324707" ...
##   .. ..$ : chr [1:4301] "4301_0.42" "4300_0.44" "4299_0.45" "4298_0.46" ...
##  $ stats      :'data.frame': 16644 obs. of  6 variables:
##   ..$ cluster       : chr [1:16644] "4301_0.42" "4300_0.44" "4299_0.45" "4298_0.46" ...
##   ..$ parent_cluster: chr [1:16644] "root" "4301_0.42" "4300_0.44" "4299_0.45" ...
##   ..$ distance      : chr [1:16644] "0.42" "0.02" "0.01" "0.01" ...
##   ..$ nV            : num [1:16644] 12343 12341 12338 12337 12336 ...
##   ..$ nE            : num [1:16644] 948957 868807 783325 718563 649427 ...
##   ..$ lambda        : num [1:16644] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tree       :List of 4
##   ..$ edge       : int [1:16643, 1:2] 12344 12344 12344 12345 12345 12345 12345 12346 12346 12347 ...
##   ..$ edge.length: num [1:16643] 0.29 0.29 0.01 0.28 0.28 0.28 0.005 0.275 0.005 0.27 ...
##   ..$ tip.label  : chr [1:12343] "AX-580321672" "AX-580321676" "AX-580324669" "AX-580324707" ...
##   ..$ Nnode      : int 4301
##   ..- attr(*, "class")= chr "phylo"
##   ..- attr(*, "order")= chr "cladewise"
##  $ lambda_min :Classes 'data.table' and 'data.frame':    4300 obs. of  2 variables:
##   ..$ V1: num [1:4300] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.52 0.56 0.01 ...
##   ..$ V2: num [1:4300] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>

We can use the same parameters that we used for the other population

# snp_count <- ncol(ld2)
edges_max <- 100
edges_min <- 20

We can test the number of edges in increments of 10 and use the default phi=2.

pdf(file = here("output", "ldna", "pop", "chr3", paste(edges_min, "3.NEW_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim= 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr3", "summary_NEW.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr3", "summary_NEW.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off()

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)

Save it

max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr3", "NEW_clusters_snps.rds"))
head(df)
##      1685_0.87    1916_0.86    2014_0.85    2016_0.85    2183_0.84    2184_0.84
## 1 AX-581496206 AX-580341248 AX-580363985 AX-580435937 AX-580337416 AX-580463586
## 2 AX-581498050 AX-580366329 AX-580535569 AX-580433304 AX-580348293 AX-580463625
## 3 AX-581498741 AX-580364850 AX-580627341 AX-580449057 AX-580349297 AX-580466269
## 4 AX-581498822 AX-580370586 AX-580742600 AX-580451828 AX-580353512 AX-580464938
## 5 AX-581496988 AX-580375223 AX-580813102 AX-580449281 AX-580360973 AX-580468247
## 6 AX-581499216 AX-580380488 AX-581977448 AX-580503902 AX-580399805 AX-580510394
##      2202_0.84    2203_0.84    2226_0.84    2320_0.83    2533_0.82    2695_0.81
## 1 AX-580572128 AX-580650857 AX-580478169 AX-580409398 AX-580389602 AX-580509747
## 2 AX-580606224 AX-582335801 AX-580505393 AX-580419923 AX-580397918 AX-580604594
## 3 AX-581933084 AX-582336159 AX-580507770 AX-580450171 AX-580400203 AX-580606268
## 4 AX-581983512 AX-582336422 AX-580556157 AX-580454833 AX-580404113 AX-580607973
## 5 AX-582057877 AX-582336505 AX-580585919 AX-580461500 AX-580409114 AX-580649395
## 6 AX-582151512 AX-582354718 AX-580603450 AX-580464442 AX-580412434 AX-580649489
##      2700_0.81    3925_0.72
## 1 AX-580819381 AX-581613911
## 2 AX-580820107 AX-581615619
## 3 AX-580902452 AX-581615851
## 4 AX-580905204 AX-581628303
## 5 AX-580905437 AX-581633368
## 6 AX-580903210 AX-581635832

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##     Cluster          SNP
## 1 1685_0.87 AX-581496206
## 2 1685_0.87 AX-581498050
## 3 1685_0.87 AX-581498741
## 4 1685_0.87 AX-581498822
## 5 1685_0.87 AX-581496988
## 6 1685_0.87 AX-581499216
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr3", "NEW_clusters_snps1.rds"))

Make long format

new_chr3 <- melt(do, na.rm = T, value.name = "value")
head(new_chr3)
##   Var1      Var2        value
## 1    1 1685_0.87 AX-581496206
## 2    2 1685_0.87 AX-581498050
## 3    3 1685_0.87 AX-581498741
## 4    4 1685_0.87 AX-581498822
## 5    5 1685_0.87 AX-581496988
## 6    6 1685_0.87 AX-581499216

Update names

colnames(new_chr3)<- c("v1", "cluster", "SNP")
head(new_chr3)
##   v1   cluster          SNP
## 1  1 1685_0.87 AX-581496206
## 2  2 1685_0.87 AX-581498050
## 3  3 1685_0.87 AX-581498741
## 4  4 1685_0.87 AX-581498822
## 5  5 1685_0.87 AX-581496988
## 6  6 1685_0.87 AX-581499216

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
new_snps_chr <- import_bim(here("output", "ldna", "pop", "NEW.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(new_snps_chr)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(new_chr3, new_snps_chr, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP v1   cluster Chromosome Position
## 1 AX-580337416  1 2183_0.84          3  3575966
## 2 AX-580341248  1 1916_0.86          3  4307552
## 3 AX-580348293  2 2183_0.84          3  6215232
## 4 AX-580349297  3 2183_0.84          3  6355706
## 5 AX-580353512  4 2183_0.84          3  6866266
## 6 AX-580360973  5 2183_0.84          3  8280336

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##     cluster          SNP Position
## 1 2183_0.84 AX-580337416  3575966
## 2 1916_0.86 AX-580341248  4307552
## 3 2183_0.84 AX-580348293  6215232
## 4 2183_0.84 AX-580349297  6355706
## 5 2183_0.84 AX-580353512  6866266
## 6 2183_0.84 AX-580360973  8280336

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##     cluster          SNP Position
## 1 2183_0.84 AX-580337416  3575966
## 2 1916_0.86 AX-580341248  4307552
## 3 2183_0.84 AX-580348293  6215232
## 4 2183_0.84 AX-580349297  6355706
## 5 2183_0.84 AX-580353512  6866266
## 6 2183_0.84 AX-580360973  8280336
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##     cluster Position
## 1 2183_0.84  3575966
## 2 1916_0.86  4307552
## 3 2183_0.84  6215232
## 4 2183_0.84  6355706
## 5 2183_0.84  6866266
## 6 2183_0.84  8280336

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr3", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr3", "SNPs_clusters_NEW_chr3.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr3", "clusters", "sushi_NEW_chr3.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr3/clusters/sushi_NEW_chr3.txt > output/ldna/pop/chr3/clusters/sushi2_NEW_chr3.txt;
head output/ldna/pop/chr3/clusters/sushi2_NEW_chr3.txt
## Cluster  Start   End Size
## 2183_0.84    3575966 3575966 0
## 1916_0.86    4307552 4307552 0
## 2183_0.84    6215232 8280336 2065104
## 2014_0.85    8741873 8741873 0
## 1916_0.86    8798971 11276541    2477570
## 2533_0.82    16401719    17356798    955079
## 1916_0.86    17851890    17851890    0
## 2183_0.84    18199243    18199243    0
## 2533_0.82    18233715    18233715    0
# Define the path
input_path <- here("output", "ldna", "pop", "chr3", "clusters", "sushi2_NEW_chr3.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')


# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 1685_0.87         9
## 2 1916_0.86        43
## 3 2014_0.85        18
## 4 2016_0.85        79
## 5 2183_0.84       169
## 6 2184_0.84        52

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr3",
       "clusters",
       "sushi2_NEW_chr3.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 3 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 3, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr3", "clusters", "NEW_chr3.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 3 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 3, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr3", "clusters", "NEW_chr3b.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can save the data to plot all chromosomes togeter using a facet plot

head(annotated_data)
##     Cluster     Start       End    Size   Start_Mb    End_Mb nSegments
## 1 1685_0.87 223571623 224099736  528113 223.571623 224.09974         9
## 2 1916_0.86 168807623 169084778  277155 168.807623 169.08478        43
## 3 1916_0.86 397477146 397772735  295589 397.477146 397.77274        43
## 4 1916_0.86 364979898 365168835  188937 364.979898 365.16884        43
## 5 1916_0.86   8798971  11276541 2477570   8.798971  11.27654        43
## 6 2014_0.85 426713646 426857944  144298 426.713646 426.85794        18
##   Cluster_with_SNPs
## 1     1685_0.87 (9)
## 2    1916_0.86 (43)
## 3    1916_0.86 (43)
## 4    1916_0.86 (43)
## 5    1916_0.86 (43)
## 6    2014_0.85 (18)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments    Start      End    Size  Start_Mb
## 1          1 2183_0.84 0.84       169  6215232  8280336 2065104  6.215232
## 2          1 1916_0.86 0.86        43  8798971 11276541 2477570  8.798971
## 3          1 2533_0.82 0.82       198 16401719 17356798  955079 16.401719
## 4          1 2183_0.84 0.84       169 19692003 19972094  280091 19.692003
## 5          1 2533_0.82 0.82       198 21872654 22221783  349129 21.872654
## 6          1 2533_0.82 0.82       198 24485456 24859962  374506 24.485456
##      End_Mb
## 1  8.280336
## 2 11.276541
## 3 17.356798
## 4 19.972094
## 5 22.221783
## 6 24.859962
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr3", "NEW_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr3",
       "summary_NEW.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(3, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("NEW", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs
## 1          3 2183_0.84 0.84  6215232  8280336       169   342
## 2          3 1916_0.86 0.86  8798971 11276541        43    66
## 3          3 2533_0.82 0.82 16401719 17356798       198   337
## 4          3 2183_0.84 0.84 19692003 19972094       169   342
## 5          3 2533_0.82 0.82 21872654 22221783       198   337
## 6          3 2533_0.82 0.82 24485456 24859962       198   337
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr3", "NEW_plot2.rds"))

3.4 AUT

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells 2734960 146.1    5389527  287.9         NA    5389527  287.9
## Vcells 4886084  37.3  764850266 5835.4      32768 1171951721 8941.3

Import the data

ld1 <-
  read.delim(
    here("output", "ldna", "pop", "chr3", "AUT.chr3.txt"),
    sep = "\t",
    header = T,
    row.names = 1,
    stringsAsFactors = F,
    check.names = FALSE
  )

Check the LD matrix we built using bash

head(ld1)

The row and column names are correct. Now we can convert the data frame to matrix

# Create the matrix for ldna
ld2<- as.matrix(sapply(ld1, as.numeric)) 

# Get names 
names<- rownames(ld1)
#names<-gsub("-", ".", names)

# Add names
row.names(ld2)<-names

# Remove diagonal values = 1
diag(ld2)=NA 

# Make sure it is only the lower triangle matrix 
ld2[!lower.tri(ld2)] <- NA 

Create a LDna object

# create the LDna object (change the number of cores as needed)
ldna <- LDnaRaw(ld2, mc.cores=6, method = "single")

Check the object

str(ldna)

Save it

saveRDS(ldna, file = here("output", "ldna", "pop", "chr3", "AUT.rds"))
saveRDS(ld2, file = here("output", "ldna", "pop", "chr3", "AUT_ld2.rds"))

Load the data

ldna <- readRDS(here("output", "ldna", "pop", "chr3", "AUT.rds"))
ld2 <- readRDS(here("output", "ldna", "pop", "chr3", "AUT_ld2.rds"))

We can use the same parameters that we used for the other population

# snp_count <- ncol(ld2)
edges_max <- 100
edges_min <- 20

We can test the number of edges in increments of 10 and use the default phi=2

pdf(file = here("output", "ldna", "pop", "chr3", paste(edges_min, "3.AUT_loop.edges.pdf", sep = "_")), width = 20, height = 12)
op <- par(mfcol=c(1,4))

# Adjust the step size here
step_size <- 10  # for example, try a smaller step like 10

for (edges_estimate in seq(edges_min, edges_max, by = step_size)) {
  clusters_count_prev <- 100
  clusters_count <- 50
  
  # The 'if' condition seems redundant here since clusters_count_prev is always set to 100 before the check
  if (clusters_count_prev > clusters_count) {
    print(edges_estimate) # This prints the current edges_estimate, showing that the loop is working
    clusters_count_prev <- clusters_count
  
    # Extract clusters
    clusters <- extractClusters(ldna, LDmat=ld2, min.edges=edges_estimate, lambda.lim= 1, extract=TRUE, plot.graph=TRUE, rm.COCs=TRUE, branch.traversal=TRUE)
    
    # Summarize the clusters
    summary <- summaryLDna(ldna, clusters, ld2)
    
    # Write the summary to a file
    write.table(summary, file = here("output", "ldna", "pop", "chr3", "summary_AUT.txt"), row.names=FALSE, sep="\t", quote=FALSE)
    
    # Update clusters_count with the number of clusters just written
    clusters_count <- nrow(read.delim(here("output", "ldna", "pop", "chr3", "summary_AUT.txt")))
  }
}
## [1] 20
## [1] 30
## [1] 40
## [1] 50
## [1] 60
## [1] 70
## [1] 80
## [1] 90
## [1] 100
# Reset graphical parameters and close all open graphic devices
par(op)
while (!is.null(dev.list())) dev.off()

Get cluster information

# to get the list of names of the clusters
# Find the length of the longest vector
max_length <- max(sapply(clusters$clusters, length))

# Function to pad vectors with NAs to make them the same length
pad_vector_to_max_length <- function(vec, max_length) {
  c(vec, rep(NA, max_length - length(vec)))
}

# Apply the padding function to each vector in the list and combine into a data frame
do1 <- as.data.frame(do.call(cbind, lapply(clusters$clusters, pad_vector_to_max_length, max_length)))

# to get list of snps for each cluster
do <- do.call(cbind, unname(lapply(clusters$clusters, `length<-`, max(lengths(clusters$clusters)))))
colnames(do) <- colnames(do1)

Save it

max_length <- max(sapply(clusters$clusters, length))
# Initialize an empty list
cluster_df_list <- list()

# Loop through each cluster and pad with NAs
for (cluster_name in names(clusters$clusters)) {
  cluster_length <- length(clusters$clusters[[cluster_name]])
  padded_cluster <- c(clusters$clusters[[cluster_name]], rep(NA, max_length - cluster_length))
  cluster_df_list[[cluster_name]] <- padded_cluster
}

# Combine the lists into a data frame
df <- data.frame(cluster_df_list)
# Remove 'X' from column names
names(df) <- make.names(names(df), unique = FALSE)
names(df) <- sub("X", "", names(df), fixed = TRUE)

# Save it
saveRDS(df, file = here("output", "ldna", "pop", "chr3", "AUT_clusters_snps.rds"))
head(df)
##          178_1        376_1    1513_0.94    1519_0.94    1549_0.94    1589_0.93
## 1 AX-580614269 AX-580613111 AX-580392948 AX-581603643 AX-580390796 AX-580324669
## 2 AX-581883048 AX-581825073 AX-580392740 AX-581706856 AX-580392760 AX-580396604
## 3 AX-581909980 AX-581837704 AX-580394715 AX-581708068 AX-580393451 AX-580396762
## 4 AX-581938115 AX-581839712 AX-580395882 AX-581780523 AX-580403465 AX-580394906
## 5 AX-581975531 AX-581846896 AX-580396987 AX-581782550 AX-580416963 AX-580394928
## 6 AX-582002441 AX-581849212 AX-580400745 AX-581786527 AX-580421747 AX-580398234
##      1627_0.93    1674_0.93    1733_0.92    1870_0.91     2003_0.9     2069_0.9
## 1 AX-580321672 AX-580720065 AX-580376111 AX-580613449 AX-581617295 AX-580973311
## 2 AX-580422760 AX-580721710 AX-580401397 AX-580613672 AX-581624266 AX-580973825
## 3 AX-580429083 AX-580739510 AX-580402057 AX-580613995 AX-581627056 AX-580975031
## 4 AX-580488704 AX-581807947 AX-580405912 AX-580631000 AX-581628867 AX-580972238
## 5 AX-580631557 AX-581820891 AX-580406511 AX-580649488 AX-581639524 AX-580976312
## 6 AX-580636145 AX-581837002 AX-580409016 AX-580685769 AX-581732427 AX-580976350
##      2141_0.89    2146_0.89    2235_0.89    2305_0.88    2306_0.88    2337_0.88
## 1 AX-580324753 AX-580471701 AX-580390215 AX-580394539 AX-580409860 AX-580637321
## 2 AX-580394160 AX-580614956 AX-580394672 AX-580436239 AX-580409919 AX-580639693
## 3 AX-580393602 AX-580614976 AX-580405542 AX-580585687 AX-580426060 AX-580640208
## 4 AX-580397162 AX-580613817 AX-580408458 AX-580587903 AX-580447846 AX-580647993
## 5 AX-580401483 AX-580633783 AX-580436466 AX-580630014 AX-580465708 AX-580654626
## 6 AX-580402498 AX-580635708 AX-580444913 AX-580631341 AX-580485127 AX-580660735
##      2360_0.88    2428_0.87    2430_0.87    2434_0.87    2438_0.87    2439_0.87
## 1 AX-580471358 AX-580474900 AX-581358760 AX-580643632 AX-580396200 AX-580460220
## 2 AX-580478169 AX-580606601 AX-581498193 AX-582056607 AX-580400243 AX-580496983
## 3 AX-580986394 AX-580636557 AX-581553474 AX-582097904 AX-580406904 AX-580628035
## 4 AX-580984011 AX-580653236 AX-581671691 AX-582102976 AX-580456169 AX-580630113
## 5 AX-581506747 AX-580678854 AX-581701932 AX-582113438 AX-580481521 AX-580649455
## 6 AX-581506896 AX-580713043 AX-581747378 AX-582120761 AX-580496842 AX-580659167
##      2440_0.87    2465_0.87    2466_0.87    2476_0.87    2491_0.87    2517_0.86
## 1 AX-580403653 AX-580321676 AX-580698601 AX-580389168 AX-580612984 AX-580693023
## 2 AX-580404486 AX-580324707 AX-580726109 AX-580403951 AX-580611702 AX-580697452
## 3 AX-580408576 AX-580388940 AX-580732306 AX-580403982 AX-580989602 AX-580699058
## 4 AX-580432559 AX-580389602 AX-580732091 AX-580409382 AX-580990270 AX-580701978
## 5 AX-580433097 AX-580394757 AX-580739000 AX-580413599 AX-580992518 AX-580702148
## 6 AX-580461887 AX-580393702 AX-580739323 AX-580449057 AX-581567910 AX-580705627
##      2559_0.86    2569_0.86    2579_0.86    2613_0.86    2615_0.85    2618_0.85
## 1 AX-582038845 AX-580637230 AX-580391907 AX-580322681 AX-580609674 AX-580637233
## 2 AX-582046269 AX-580649886 AX-580402163 AX-580322843 AX-580639127 AX-580643885
## 3 AX-582052050 AX-580681344 AX-580406767 AX-580323011 AX-580644427 AX-580706944
## 4 AX-582053708 AX-580820107 AX-580407714 AX-580326021 AX-580643422 AX-581684656
## 5 AX-582057824 AX-581757255 AX-580482040 AX-580323896 AX-580651015 AX-581687058
## 6 AX-582066457 AX-581792294 AX-580490392 AX-580323966 AX-580650208 AX-581709703
##      2629_0.85    2630_0.85    2634_0.85    2636_0.85    2670_0.85    2672_0.85
## 1 AX-580396726 AX-580698011 AX-580611533 AX-580628025 AX-581560081 AX-580391029
## 2 AX-580399805 AX-580699405 AX-581603886 AX-580644165 AX-581635374 AX-580397889
## 3 AX-580401100 AX-580707249 AX-581670426 AX-580652448 AX-581637708 AX-580398984
## 4 AX-580405209 AX-580709726 AX-581732023 AX-580652092 AX-581639029 AX-580403568
## 5 AX-580403064 AX-580714281 AX-581733122 AX-580661447 AX-581638053 AX-580403904
## 6 AX-580407468 AX-580712946 AX-581846047 AX-580661554 AX-581638393 AX-580404264
##      2677_0.85    2680_0.85    2682_0.85    2696_0.84    2699_0.84    2700_0.84
## 1 AX-581822955 AX-580437463 AX-581503303 AX-580401315 AX-580403431 AX-580324728
## 2 AX-581965424 AX-580440314 AX-581540858 AX-580435631 AX-580403424 AX-580403347
## 3 AX-581965662 AX-580465328 AX-581739749 AX-580505377 AX-580414240 AX-580403733
## 4 AX-581967184 AX-580479328 AX-581741137 AX-580508688 AX-580424154 AX-580416937
## 5 AX-581967530 AX-580488345 AX-581741441 AX-580509381 AX-580444443 AX-580424773
## 6 AX-581991857 AX-580502401 AX-581741462 AX-580515997 AX-580459296 AX-580431662
##      2742_0.84    2772_0.83    2773_0.83    2776_0.83    2779_0.83    2807_0.83
## 1 AX-580408746 AX-580606517 AX-580692997 AX-580986438 AX-580393300 AX-581288605
## 2 AX-580429752 AX-580635725 AX-580698397 AX-580987713 AX-580403237 AX-581293360
## 3 AX-580429861 AX-580635730 AX-580725355 AX-580993953 AX-580404862 AX-581292774
## 4 AX-580687524 AX-580636117 AX-580728287 AX-580990798 AX-580490746 AX-581296670
## 5 AX-580687274 AX-580639999 AX-580728675 AX-580992198 AX-580497064 AX-581300336
## 6 AX-580688629 AX-580649489 AX-580728690 AX-580994585 AX-580497106 AX-581300406
##      2808_0.83    2846_0.82    2849_0.82    2856_0.82    2858_0.82    2861_0.82
## 1 AX-580390942 AX-580444656 AX-580758054 AX-580606553 AX-581304069 AX-580395610
## 2 AX-580404664 AX-580726685 AX-580758545 AX-580608605 AX-581304151 AX-580400834
## 3 AX-580405165 AX-580729651 AX-580755426 AX-580609579 AX-581302943 AX-580404113
## 4 AX-580456408 AX-580729451 AX-580760778 AX-580614818 AX-581305428 AX-580407452
## 5 AX-580463169 AX-580729782 AX-580762034 AX-580616627 AX-581308442 AX-580445827
## 6 AX-580464977 AX-580732137 AX-580765541 AX-580616657 AX-581309923 AX-580505263
##      2876_0.82    2895_0.82    2898_0.82    2901_0.81    2921_0.81    2923_0.81
## 1 AX-580393848 AX-580555010 AX-581142855 AX-580390202 AX-581765913 AX-580539469
## 2 AX-580403143 AX-580552692 AX-581187704 AX-580400203 AX-581781057 AX-580577985
## 3 AX-580403859 AX-580555171 AX-581188351 AX-580420295 AX-581821709 AX-580609940
## 4 AX-580431221 AX-580553179 AX-581184742 AX-580423579 AX-581825083 AX-580675861
## 5 AX-580431375 AX-580553188 AX-581189839 AX-580446263 AX-581833439 AX-580698516
## 6 AX-580428768 AX-580554541 AX-581190771 AX-580486272 AX-581849264 AX-580723951
##      2926_0.81     2954_0.8     2957_0.8     2958_0.8     2968_0.8     2978_0.8
## 1 AX-581765172 AX-580813960 AX-580696904 AX-580815882 AX-581084049 AX-580691967
## 2 AX-581773463 AX-580813102 AX-580713495 AX-580929656 AX-581087577 AX-581239552
## 3 AX-582057332 AX-580817509 AX-580728036 AX-580932335 AX-581090484 AX-581248349
## 4 AX-582073358 AX-580820330 AX-580726970 AX-580929829 AX-581091273 AX-581244552
## 5 AX-582087479 AX-580828151 AX-580728596 AX-580940732 AX-581095861 AX-581252179
## 6 AX-582305358 AX-580830161 AX-581793617 AX-580949586 AX-581094618 AX-581250090
##      2986_0.79    3018_0.79    3022_0.78    3023_0.78    3034_0.78    3046_0.77
## 1 AX-580668472 AX-581396708 AX-580551388 AX-580765551 AX-581541862 AX-580321736
## 2 AX-580988058 AX-581397081 AX-580551410 AX-580811408 AX-581737962 AX-580400346
## 3 AX-580992347 AX-581397377 AX-580554382 AX-580815631 AX-581737980 AX-580406692
## 4 AX-580993086 AX-581396415 AX-580554828 AX-580828541 AX-581739183 AX-580412434
## 5 AX-580989651 AX-581399643 AX-580553411 AX-580838574 AX-581738020 AX-580459719
## 6 AX-580994398 AX-581399706 AX-580555961 AX-580840039 AX-581739439 AX-580464499
##      3048_0.77    3049_0.77    3050_0.77    3058_0.77    3059_0.77    3076_0.77
## 1 AX-580385855 AX-581020972 AX-580345500 AX-580733267 AX-580886052 AX-580325617
## 2 AX-582238865 AX-581046789 AX-580348573 AX-580735708 AX-580902182 AX-580325637
## 3 AX-582239067 AX-581056133 AX-580349307 AX-580734693 AX-580907427 AX-580322712
## 4 AX-582239406 AX-581057769 AX-580351597 AX-580741085 AX-580910312 AX-580325736
## 5 AX-582239412 AX-581057847 AX-580355834 AX-580742377 AX-580908672 AX-580326281
## 6 AX-582240806 AX-581058565 AX-580357141 AX-580742602 AX-580908882 AX-580323342
##      3080_0.76    3087_0.76    3105_0.75    3119_0.74    3139_0.73    3140_0.73
## 1 AX-580892444 AX-581425963 AX-580329430 AX-580734902 AX-580359720 AX-580346893
## 2 AX-581018913 AX-581426316 AX-580330660 AX-580746638 AX-580359737 AX-580349646
## 3 AX-581019153 AX-581424333 AX-580334614 AX-580750088 AX-580357929 AX-580348040
## 4 AX-581046801 AX-581424472 AX-580335187 AX-580757935 AX-580361874 AX-580349996
## 5 AX-581073042 AX-581424635 AX-580335504 AX-580765447 AX-580363916 AX-580357799
## 6 AX-581070954 AX-581428312 AX-580334973 AX-580772677 AX-580363991 AX-580362805
##      3143_0.73    3146_0.73    3147_0.73    3154_0.72    3160_0.72    3161_0.72
## 1 AX-581225628 AX-580322728 AX-581023125 AX-581231004 AX-581244779 AX-581481903
## 2 AX-581227484 AX-580322900 AX-581025377 AX-581249253 AX-581242711 AX-581483002
## 3 AX-581231082 AX-580325783 AX-581029046 AX-581253352 AX-581243871 AX-581484904
## 4 AX-581243592 AX-580326421 AX-581029303 AX-581255656 AX-581251705 AX-581483676
## 5 AX-581243710 AX-580328808 AX-581048156 AX-581258267 AX-581248951 AX-581485368
## 6 AX-581247699 AX-580331003 AX-581073288 AX-581258422 AX-581253253 AX-581485333
##      3170_0.71    3172_0.71    3201_0.67    3228_0.62
## 1 AX-581218111 AX-580338227 AX-580388195 AX-581489603
## 2 AX-581219933 AX-580339155 AX-581446592 AX-581487764
## 3 AX-581226872 AX-580337203 AX-581447178 AX-581487882
## 4 AX-581223687 AX-580339228 AX-581445080 AX-581487998
## 5 AX-581227351 AX-580337690 AX-581447458 AX-581488057
## 6 AX-581223737 AX-580337709 AX-581446097 AX-581488185

We can save it in a different format as well

# Assuming your data frame is named df
long_df <- gather(df, Cluster, SNP, everything())

# View the first few rows of the transformed data
head(long_df)
##   Cluster          SNP
## 1   178_1 AX-580614269
## 2   178_1 AX-581883048
## 3   178_1 AX-581909980
## 4   178_1 AX-581938115
## 5   178_1 AX-581975531
## 6   178_1 AX-582002441
# Save it
saveRDS(long_df, file = here("output", "ldna", "pop", "chr3", "AUT_clusters_snps1.rds"))

Make long format

aut_chr2 <- melt(do, na.rm = T, value.name = "value")
head(aut_chr2)
##   Var1  Var2        value
## 1    1 178_1 AX-580614269
## 2    2 178_1 AX-581883048
## 3    3 178_1 AX-581909980
## 4    4 178_1 AX-581938115
## 5    5 178_1 AX-581975531
## 6    6 178_1 AX-582002441

Update names

colnames(aut_chr2)<- c("v1", "cluster", "SNP")
head(aut_chr2)
##   v1 cluster          SNP
## 1  1   178_1 AX-580614269
## 2  2   178_1 AX-581883048
## 3  3   178_1 AX-581909980
## 4  4   178_1 AX-581938115
## 5  5   178_1 AX-581975531
## 6  6   178_1 AX-582002441

Import the bim file to get the SNP positions

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
aut_snps_chr <- import_bim(here("output", "ldna", "pop", "AUT.bim")) |>
  dplyr::select(
    SNP, Scaffold, Position
  ) |>
  dplyr::rename(
    Chromosome = Scaffold
  )

# Check it
head(aut_snps_chr)
## # A tibble: 6 × 3
##   SNP          Chromosome Position
##   <chr>        <chr>         <dbl>
## 1 AX-583033342 1            315059
## 2 AX-583035163 1            315386
## 3 AX-583035194 1            330265
## 4 AX-583035257 1            442875
## 5 AX-583035355 1            540754
## 6 AX-583034838 1           1138155

Merge the cluster data and the SNP data

# merge dataframes
merged<- merge(aut_chr2, aut_snps_chr, by="SNP", all.x=TRUE)
merged<-na.omit(merged)
head(merged)
##            SNP v1   cluster Chromosome Position
## 1 AX-580321672  1 1627_0.93          3    89593
## 2 AX-580321676  1 2465_0.87          3    89854
## 3 AX-580321736  1 3046_0.77          3   134811
## 4 AX-580322681  1 2613_0.86          3  1035972
## 5 AX-580322712  3 3076_0.77          3  1036675
## 6 AX-580322728  1 3146_0.73          3  1036937

Select the columns we need

# subset
merged <- subset(merged, select = c(cluster, SNP, Position))
head(merged)
##     cluster          SNP Position
## 1 1627_0.93 AX-580321672    89593
## 2 2465_0.87 AX-580321676    89854
## 3 3046_0.77 AX-580321736   134811
## 4 2613_0.86 AX-580322681  1035972
## 5 3076_0.77 AX-580322712  1036675
## 6 3146_0.73 AX-580322728  1036937

Sort by distance

merged<- merged[order(merged$Position),,drop=FALSE]
head(merged)
##      cluster          SNP Position
## 1  1627_0.93 AX-580321672    89593
## 2  2465_0.87 AX-580321676    89854
## 20 1589_0.93 AX-580324669    90693
## 21 2465_0.87 AX-580324707   125814
## 3  3046_0.77 AX-580321736   134811
## 22 2700_0.84 AX-580324728   138261
# for parsing later
sushi1<- subset(merged, select = c(cluster, Position))
head(sushi1)
##      cluster Position
## 1  1627_0.93    89593
## 2  2465_0.87    89854
## 20 1589_0.93    90693
## 21 2465_0.87   125814
## 3  3046_0.77   134811
## 22 2700_0.84   138261

Save the data

# Create directory
new_directory_path <- here("output", "ldna", "pop", "chr3", "clusters")
dir.create(new_directory_path, recursive = TRUE)

# save file
write.table(merged, file = here("output", "ldna", "pop", "chr3", "SNPs_clusters_AUT_chr3.txt"), row.names = F, sep = "\t", quote = F)
write.table(sushi1, file = here("output", "ldna", "pop", "chr3", "clusters", "sushi_AUT_chr3.txt"), row.names = F, sep = "\t", quote = F)

Parse the file1 to get the start and end of the clusters as well as their size. Remember, recombination make it a mosaic, so the clusters are “mixed”.

awk '
function print_row() {
    if ( feature != "" )
       print feature, start, end, (end - start)
}

BEGIN {
    FS=OFS="\t";
    print "Cluster", "Start", "End", "Size";
}
NR == 1 { next } # Skip the first line
$1 != feature {
    print_row();
    feature = $1;
    start = $2;
    end = $2;
    next;
}
{
    end = $2;
}
END {
    print_row();
}
' output/ldna/pop/chr3/clusters/sushi_AUT_chr3.txt > output/ldna/pop/chr3/clusters/sushi2_AUT_chr3.txt;
head output/ldna/pop/chr3/clusters/sushi2_AUT_chr3.txt
## Cluster  Start   End Size
## 1627_0.93    89593   89593   0
## 2465_0.87    89854   89854   0
## 1589_0.93    90693   90693   0
## 2465_0.87    125814  125814  0
## 3046_0.77    134811  134811  0
## 2700_0.84    138261  138261  0
## 2141_0.89    145354  145354  0
## 3076_0.77    1015103 1017316 2213
## 2613_0.86    1035972 1035972 0

Get SNP count

# Define the path
input_path <- here("output", "ldna", "pop", "chr3", "clusters", "sushi2_AUT_chr3.txt")

# Read the data into R
clusters_data <- read_table(input_path, col_names = TRUE, col_types = NULL) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Cluster = col_character(),
##   Start = col_double(),
##   End = col_double(),
##   Size = col_double()
## )
# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Count segments for each cluster - if you want only the counts
segment_counts <- clusters_data %>%
  group_by(Cluster) %>%
  summarize(nSegments = n(), .groups = 'drop')

# View the result
head(segment_counts)
## # A tibble: 6 × 2
##   Cluster   nSegments
##   <chr>         <int>
## 1 1513_0.94       166
## 2 1519_0.94        67
## 3 1549_0.94        79
## 4 1589_0.93       270
## 5 1627_0.93       133
## 6 1674_0.93        73

Plot it

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr3",
       "clusters",
       "sushi2_AUT_chr3.txt")

# Read the data into R
clusters_data <- read.table(input_path, header = TRUE, sep = "\t") |>
  arrange(Start)

# Arrange by position
clusters_data <- clusters_data |>
  arrange(Start)

# Filter out rows with Size equal to 0
clusters_data <- subset(clusters_data, Size > 100000) # show only bigger than 100kb


# Calculate the maximum size for scaling
max_size <- max(clusters_data$Size)

# Create Start_Mb and End_Mb within clusters_data
clusters_data$Start_Mb <- clusters_data$Start / 1e6
clusters_data$End_Mb <- clusters_data$End / 1e6

# Create the plot
ggplot(clusters_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  scale_y_continuous(labels = label_number(unit = "M"), breaks = pretty_breaks(n = 10)) +
  labs(x = "Chromosome 3 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(ncol = 7, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr3", "clusters", "AUT_chr3.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can annotate the number of segments in the legend.

# Merge SNP counts into clusters_data while specifying suffixes
annotated_data <- merge(clusters_data, segment_counts, by = "Cluster")

# Create a new column with Cluster names and SNP counts
annotated_data$Cluster_with_SNPs <- with(annotated_data, paste(Cluster, " (", nSegments, ")", sep=""))

# Now the plot uses the new Cluster_with_SNPs for the fill legend
ggplot(annotated_data, aes(xmin = Start_Mb, xmax = End_Mb, ymin = 0, ymax = Size / 1e6)) +
  geom_rect(aes(fill = as.factor(Cluster_with_SNPs)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  scale_y_continuous(labels = scales::label_number(unit = "M"), breaks = scales::pretty_breaks(n = 10)) +
  labs(x = "Chromosome 3 (Mb)", y = "Cluster Size (Mb)", fill = "Cluster ID") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
    legend.box = "horizontal"
  ) +
  guides(fill = guide_legend(nrow = 10, title.position = "top", title.hjust = 0.5))

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "pop", "chr3", "clusters", "AUT_chr3b.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

Save plotting data, we can arrange the column order first

plot_data <- annotated_data |>
  dplyr::select(
    Cluster, nSegments, Start, End, Size, Start_Mb, End_Mb
  )

# Create a new column with Chromosome
plot_data <- data.frame(Chromosome = rep(1, nrow(plot_data)), plot_data)

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
plot_data$r2 <- sapply(strsplit(as.character(plot_data$Cluster), "_"), `[`, 2)

# Move the 'r2' column to be the second column, right after 'Chromosome'
plot_data <- plot_data[, c(1:2, ncol(plot_data), 3:(ncol(plot_data)-1))]


# Arrange
plot_data <- plot_data |>
  dplyr::arrange(Start)

# Check it
head(plot_data)
##   Chromosome   Cluster   r2 nSegments   Start     End   Size Start_Mb   End_Mb
## 1          1 3076_0.77 0.77        12 1188868 1328413 139545 1.188868 1.328413
## 2          1 3105_0.75 0.75        18 2696176 2814582 118406 2.696176 2.814582
## 3          1 3105_0.75 0.75        18 3196996 3304359 107363 3.196996 3.304359
## 4          1 3172_0.71 0.71        93 3531140 3719631 188491 3.531140 3.719631
## 5          1 3105_0.75 0.75        18 4483499 4911455 427956 4.483499 4.911455
## 6          1 3146_0.73 0.73        19 5016717 5117973 101256 5.016717 5.117973
# Save it
saveRDS(plot_data, file = here("output", "ldna", "pop", "chr3", "AUT_plot.rds"))

We can also import the summary file to create a object with SNP counts per segment

# Define the path
input_path <-
  here("output",
       "ldna",
       "pop",
       "chr3",
       "summary_AUT.txt")

# Read the data into R
clusters_snps <- read.table(input_path, header = TRUE, sep = "\t")

# We subset and rename the column 1
clusters_snps <- clusters_snps |>
  dplyr::select(
    Name, nLoci
  ) |>
  dplyr::rename(
    Cluster = Name,
    nSNPs = nLoci
  )

# Merge
cluster_seg_snp <- inner_join(segment_counts, clusters_snps, by = "Cluster")
cluster_seg_snp <- inner_join(clusters_data, cluster_seg_snp, by = "Cluster")

# Split the 'Cluster' column at "_", and create the 'r2' column with the second part
cluster_seg_snp$r2 <- sapply(strsplit(as.character(cluster_seg_snp$Cluster), "_"), `[`, 2)

# Add chromosome number and population name
cluster_seg_snp <- data.frame(Chromosome = rep(3, nrow(cluster_seg_snp)), cluster_seg_snp)
cluster_seg_snp <- data.frame(Population = rep("AUT", nrow(cluster_seg_snp)), cluster_seg_snp)


# Reorder the columns
cluster_seg_snp <- cluster_seg_snp |>
  dplyr::select(
    Chromosome, Cluster, r2, Start, End, nSegments, nSNPs
  ) |> 
  dplyr::arrange(
    Chromosome, Start
  )

head(cluster_seg_snp)
##   Chromosome   Cluster   r2   Start     End nSegments nSNPs
## 1          3 3076_0.77 0.77 1188868 1328413        12    24
## 2          3 3105_0.75 0.75 2696176 2814582        18    43
## 3          3 3105_0.75 0.75 3196996 3304359        18    43
## 4          3 3172_0.71 0.71 3531140 3719631        93   162
## 5          3 3105_0.75 0.75 4483499 4911455        18    43
## 6          3 3146_0.73 0.73 5016717 5117973        19    23
# Save the data for plotting later
saveRDS(cluster_seg_snp, file = here("output", "ldna", "pop", "chr3", "AUT_plot2.rds"))

6. Combine the chromosomal data

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells 2737754 146.3    5389527  287.9         NA    5389527  287.9
## Vcells 4894221  37.4  563998916 4303.0      32768 1171951721 8941.3

Read the plotting data for NEW and AUT

# NEW
NEW1 <- readRDS(here("output", "ldna", "pop", "chr1", "NEW_plot2.rds")) |> mutate(Population = "NEW")
NEW2 <- readRDS(here("output", "ldna", "pop", "chr2", "NEW_plot2.rds")) |> mutate(Population = "NEW")
NEW3 <- readRDS(here("output", "ldna", "pop", "chr3", "NEW_plot2.rds")) |> mutate(Population = "NEW") 

# AUT
AUT1 <- readRDS(here("output", "ldna", "pop", "chr1", "AUT_plot2.rds")) |> mutate(Population = "AUT")
AUT2 <- readRDS(here("output", "ldna", "pop", "chr2", "AUT_plot2.rds")) |> mutate(Population = "AUT") 
AUT3 <- readRDS(here("output", "ldna", "pop", "chr3", "AUT_plot2.rds")) |> mutate(Population = "AUT") 

Bind the objects

albo <- rbind(NEW1, NEW2, NEW3, AUT1, AUT2, AUT3)

head(albo)
##   Chromosome   Cluster   r2    Start      End nSegments nSNPs Population
## 1          1 1679_0.85 0.85 11090350 11359739        39    41        NEW
## 2          1 1554_0.86 0.86 11618085 16238366        41    50        NEW
## 3          1 1872_0.83 0.83 30793588 30946077        77    95        NEW
## 4          1 1788_0.84 0.84 31157406 31305550       266   353        NEW
## 5          1 1677_0.85 0.85 32809265 32964732       116   137        NEW
## 6          1 1800_0.84 0.84 35740120 36594644       103   121        NEW

We can create a table

table1 <- albo |>
  mutate(Size = End - Start) |>
  dplyr::select(
    Population, Chromosome, Cluster, r2, nSegments, nSNPs, Start, End, Size
  ) |>
  arrange(Population, Chromosome, Start)
head(table1)
##   Population Chromosome   Cluster   r2 nSegments nSNPs    Start      End
## 1        AUT          1  1972_0.6  0.6        20   124   442875  2339987
## 2        AUT          1  1972_0.6  0.6        20   124  2935811  3536906
## 3        AUT          1 1844_0.71 0.71        20    45  3766945  4238554
## 4        AUT          1 1915_0.66 0.66        23    30  4981859  5217622
## 5        AUT          1 1844_0.71 0.71        20    45  5443736 10957198
## 6        AUT          1 1952_0.63 0.63        20    31 11125830 11276238
##      Size
## 1 1897112
## 2  601095
## 3  471609
## 4  235763
## 5 5513462
## 6  150408

We can focus on LD blocks equal or bigger than 1Mb

table1 <- table1 %>%
  dplyr::filter(Size >= 1000000) %>%
  dplyr::mutate(`Size (Mb)` = round(Size / 1000000, 2)) # %>%
  # dplyr::select(Size)
head(table1)
##   Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 1        AUT          1  1972_0.6  0.6        20   124    442875   2339987
## 2        AUT          1 1844_0.71 0.71        20    45   5443736  10957198
## 3        AUT          1 1777_0.76 0.76        45    61  87157392  88231134
## 4        AUT          1 1693_0.79 0.79        25    27 102144753 103394842
## 5        AUT          1 1596_0.82 0.82        21    50 119130691 120305920
## 6        AUT          1 1433_0.85 0.85        18    33 131161343 132428189
##      Size Size (Mb)
## 1 1897112      1.90
## 2 5513462      5.51
## 3 1073742      1.07
## 4 1250089      1.25
## 5 1175229      1.18
## 6 1266846      1.27

Create table

# Create the flextable
my_flextable <- flextable(table1)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Population

Chromosome

Cluster

r2

nSegments

nSNPs

Start

End

Size

Size (Mb)

AUT

1

1972_0.6

0.6

20

124

442,875

2,339,987

1,897,112

1.90

AUT

1

1844_0.71

0.71

20

45

5,443,736

10,957,198

5,513,462

5.51

AUT

1

1777_0.76

0.76

45

61

87,157,392

88,231,134

1,073,742

1.07

AUT

1

1693_0.79

0.79

25

27

102,144,753

103,394,842

1,250,089

1.25

AUT

1

1596_0.82

0.82

21

50

119,130,691

120,305,920

1,175,229

1.18

AUT

1

1433_0.85

0.85

18

33

131,161,343

132,428,189

1,266,846

1.27

AUT

1

1942_0.64

0.64

22

88

138,373,561

140,570,398

2,196,837

2.20

AUT

1

1528_0.84

0.84

28

38

171,598,966

172,728,015

1,129,049

1.13

AUT

1

1528_0.84

0.84

28

38

188,134,463

189,136,637

1,002,174

1.00

AUT

1

1124_0.9

0.9

211

248

260,581,858

262,016,624

1,434,766

1.43

AUT

1

1948_0.63

0.63

21

90

266,160,984

270,116,064

3,955,080

3.96

AUT

1

1972_0.6

0.6

20

124

276,166,620

280,693,512

4,526,892

4.53

AUT

1

1972_0.6

0.6

20

124

281,430,183

285,327,100

3,896,917

3.90

AUT

2

2185_0.89

0.89

190

213

123,252,079

124,310,386

1,058,307

1.06

AUT

2

2180_0.89

0.89

394

479

149,078,628

151,226,909

2,148,281

2.15

AUT

2

3403_0.67

0.67

19

54

322,962,735

325,035,189

2,072,454

2.07

AUT

2

3457_0.6

0.6

30

66

328,800,255

329,979,871

1,179,616

1.18

AUT

2

3139_0.78

0.78

70

316

390,518,548

394,323,260

3,804,712

3.80

AUT

2

3139_0.78

0.78

70

316

395,917,900

402,963,765

7,045,865

7.05

AUT

2

3139_0.78

0.78

70

316

405,006,630

407,224,834

2,218,204

2.22

AUT

2

3063_0.8

0.8

25

61

407,321,663

412,997,317

5,675,654

5.68

AUT

3

2465_0.87

0.87

422

507

23,447,984

24,876,231

1,428,247

1.43

AUT

3

2629_0.85

0.85

114

128

34,770,651

36,900,915

2,130,264

2.13

AUT

3

2636_0.85

0.85

69

82

143,567,637

144,911,025

1,343,388

1.34

AUT

3

2807_0.83

0.83

81

213

199,465,149

200,641,223

1,176,074

1.18

AUT

3

2807_0.83

0.83

81

213

201,927,576

204,102,298

2,174,722

2.17

AUT

3

3147_0.73

0.73

47

89

214,452,458

215,749,074

1,296,616

1.30

AUT

3

3201_0.67

0.67

15

72

216,985,634

219,602,266

2,616,632

2.62

NEW

1

1554_0.86

0.86

41

50

11,618,085

16,238,366

4,620,281

4.62

NEW

1

1788_0.84

0.84

266

353

46,848,250

49,120,885

2,272,635

2.27

NEW

1

1788_0.84

0.84

266

353

75,613,715

77,269,296

1,655,581

1.66

NEW

1

1560_0.86

0.86

93

109

88,407,493

90,438,527

2,031,034

2.03

NEW

1

1788_0.84

0.84

266

353

93,547,558

95,297,714

1,750,156

1.75

NEW

1

1782_0.84

0.84

93

119

105,482,568

107,443,193

1,960,625

1.96

NEW

1

1317_0.88

0.88

17

22

169,179,646

170,784,441

1,604,795

1.60

NEW

1

1782_0.84

0.84

93

119

208,024,693

209,090,551

1,065,858

1.07

NEW

1

1146_0.89

0.89

62

81

226,898,202

228,109,207

1,211,005

1.21

NEW

2

2543_0.84

0.84

372

674

17,612,329

18,832,351

1,220,022

1.22

NEW

2

2748_0.83

0.83

161

208

36,257,115

37,270,860

1,013,745

1.01

NEW

2

2543_0.84

0.84

372

674

61,309,812

63,385,698

2,075,886

2.08

NEW

2

2545_0.84

0.84

351

633

82,822,354

84,538,637

1,716,283

1.72

NEW

2

2545_0.84

0.84

351

633

129,240,899

132,723,328

3,482,429

3.48

NEW

2

2545_0.84

0.84

351

633

136,289,009

137,527,090

1,238,081

1.24

NEW

2

2624_0.84

0.84

15

37

159,391,505

160,872,568

1,481,063

1.48

NEW

2

2624_0.84

0.84

15

37

160,918,536

164,996,020

4,077,484

4.08

NEW

2

2748_0.83

0.83

161

208

192,585,621

193,792,689

1,207,068

1.21

NEW

2

2713_0.83

0.83

34

39

200,640,262

202,278,042

1,637,780

1.64

NEW

2

3951_0.75

0.75

38

62

203,165,026

204,246,205

1,081,179

1.08

NEW

2

2748_0.83

0.83

161

208

229,785,687

232,287,067

2,501,380

2.50

NEW

2

2748_0.83

0.83

161

208

232,970,945

234,100,828

1,129,883

1.13

NEW

2

2545_0.84

0.84

351

633

247,327,976

248,553,203

1,225,227

1.23

NEW

2

3426_0.79

0.79

240

326

263,127,511

264,361,900

1,234,389

1.23

NEW

2

2545_0.84

0.84

351

633

267,169,541

269,283,227

2,113,686

2.11

NEW

2

2545_0.84

0.84

351

633

280,710,015

283,869,411

3,159,396

3.16

NEW

2

2545_0.84

0.84

351

633

300,659,500

304,360,746

3,701,246

3.70

NEW

2

3951_0.75

0.75

38

62

306,590,966

308,633,302

2,042,336

2.04

NEW

2

2545_0.84

0.84

351

633

321,501,691

323,875,574

2,373,883

2.37

NEW

2

2748_0.83

0.83

161

208

343,930,528

351,760,687

7,830,159

7.83

NEW

2

2995_0.82

0.82

4

39

365,887,578

368,253,627

2,366,049

2.37

NEW

2

2545_0.84

0.84

351

633

370,117,893

375,162,415

5,044,522

5.04

NEW

2

2545_0.84

0.84

351

633

407,123,516

415,257,912

8,134,396

8.13

NEW

2

2545_0.84

0.84

351

633

416,477,946

420,091,640

3,613,694

3.61

NEW

2

2545_0.84

0.84

351

633

423,480,817

425,459,834

1,979,017

1.98

NEW

2

2545_0.84

0.84

351

633

433,356,850

434,926,662

1,569,812

1.57

NEW

2

2543_0.84

0.84

372

674

464,468,067

465,572,729

1,104,662

1.10

NEW

2

3426_0.79

0.79

240

326

475,606,473

477,615,525

2,009,052

2.01

NEW

2

2543_0.84

0.84

372

674

480,159,534

481,378,592

1,219,058

1.22

NEW

2

2543_0.84

0.84

372

674

483,395,086

484,674,520

1,279,434

1.28

NEW

2

2543_0.84

0.84

372

674

489,711,160

491,614,193

1,903,033

1.90

NEW

2

2545_0.84

0.84

351

633

514,915,553

516,163,725

1,248,172

1.25

NEW

2

2543_0.84

0.84

372

674

520,801,817

522,038,684

1,236,867

1.24

NEW

2

2545_0.84

0.84

351

633

540,102,890

541,125,553

1,022,663

1.02

NEW

2

2543_0.84

0.84

372

674

549,401,789

550,492,268

1,090,479

1.09

NEW

2

3426_0.79

0.79

240

326

551,614,176

553,072,100

1,457,924

1.46

NEW

2

2544_0.84

0.84

52

79

584,580,570

585,804,603

1,224,033

1.22

NEW

3

2183_0.84

0.84

169

342

6,215,232

8,280,336

2,065,104

2.07

NEW

3

1916_0.86

0.86

43

66

8,798,971

11,276,541

2,477,570

2.48

NEW

3

2533_0.82

0.82

198

337

33,859,167

36,902,893

3,043,726

3.04

NEW

3

2533_0.82

0.82

198

337

46,782,114

48,486,026

1,703,912

1.70

NEW

3

2533_0.82

0.82

198

337

76,622,043

77,953,224

1,331,181

1.33

NEW

3

2183_0.84

0.84

169

342

88,286,681

89,538,915

1,252,234

1.25

NEW

3

2533_0.82

0.82

198

337

96,387,529

97,703,905

1,316,376

1.32

NEW

3

2533_0.82

0.82

198

337

103,978,542

105,140,618

1,162,076

1.16

NEW

3

2320_0.83

0.83

185

260

108,099,773

109,990,109

1,890,336

1.89

NEW

3

2533_0.82

0.82

198

337

127,896,703

133,213,688

5,316,985

5.32

NEW

3

2700_0.81

0.81

34

51

133,371,924

134,578,256

1,206,332

1.21

NEW

3

2533_0.82

0.82

198

337

143,504,855

146,770,743

3,265,888

3.27

NEW

3

2533_0.82

0.82

198

337

155,504,549

159,797,863

4,293,314

4.29

NEW

3

2533_0.82

0.82

198

337

162,731,902

163,859,867

1,127,965

1.13

NEW

3

2533_0.82

0.82

198

337

163,885,638

167,366,396

3,480,758

3.48

NEW

3

2533_0.82

0.82

198

337

169,349,891

174,797,414

5,447,523

5.45

NEW

3

2533_0.82

0.82

198

337

176,420,939

179,903,623

3,482,684

3.48

NEW

3

2533_0.82

0.82

198

337

209,803,232

212,409,397

2,606,165

2.61

NEW

3

2533_0.82

0.82

198

337

215,824,850

219,669,971

3,845,121

3.85

NEW

3

3925_0.72

0.72

3

42

246,084,752

251,368,334

5,283,582

5.28

NEW

3

2700_0.81

0.81

34

51

280,219,748

282,070,496

1,850,748

1.85

NEW

3

2533_0.82

0.82

198

337

303,715,842

309,745,288

6,029,446

6.03

NEW

3

2183_0.84

0.84

169

342

321,054,349

322,347,291

1,292,942

1.29

NEW

3

2183_0.84

0.84

169

342

344,380,481

345,522,786

1,142,305

1.14

NEW

3

2320_0.83

0.83

185

260

380,107,019

381,564,291

1,457,272

1.46

NEW

3

2183_0.84

0.84

169

342

384,324,897

385,372,688

1,047,791

1.05

NEW

3

2183_0.84

0.84

169

342

385,585,248

387,315,713

1,730,465

1.73

NEW

3

2183_0.84

0.84

169

342

442,489,714

443,498,522

1,008,808

1.01

NEW

3

2183_0.84

0.84

169

342

443,555,174

445,661,410

2,106,236

2.11

NEW

3

2183_0.84

0.84

169

342

446,376,561

447,535,850

1,159,289

1.16

NEW

3

2183_0.84

0.84

169

342

467,722,457

470,726,943

3,004,486

3.00

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "ldna", "clusters_AUT_NEW_1Mb.docx"))

We can create a facet plot now for all chromosomes

# To plot only clusters equal or bigger than 1Mb
albo2 <- table1 |> dplyr::filter(Size >= 1000000) 

# Function to format numbers as Mb
label_mb <- function(x) {
  sprintf("%.0fMb", x / 1e6)
}

# Plot it
ggplot(albo2, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")  # Adjust the unit and number to increase space as needed
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed") +
  guides(fill = "none")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_fixed.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

Make each plot with different scales

# Plot it
ggplot(albo2, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    panel.grid.minor.x = element_blank(),
    # strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_wrap(~ Population + Chromosome, scales = "free_y", ncol = 3) +  # Free y scale, and each chromosome gets its own row
  guides(fill = "none")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_free.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

We can also remove the large clusters and plot with fixed scale to look at the cluster between 1 and 10Mb

# To plot only clusters equal or bigger than 1Mb but less that 10Mb
albo3 <- albo2 |> dplyr::filter(Size >= 1000000, Size <= 10000000)


# Plot it
ggplot(albo3, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")  # Adjust the unit and number to increase space as needed
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed") +
  guides(fill = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_fixed_1_to_10Mb.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

It seems that the linkage patters are quite different from AUT to the wild population. I am not sure why we are seeing clusters at such long distance the wild population for chromosome 2 and 3

We could do the analysis by sex, but we only have 13 females and 15 males in AUT. Perhaps we could use the same number of samples for each. However, the sample size will be too small still.

7. Plot SNPs on DE genes with LD blocks

snps_scan <-
  read.table(
    # here("output", "pcadapt", "man_aut_common_SNPs_pcadapt_outflank.txt"),
    here("output", "snpeff", "SNPs_79_DE.txt"),
    stringsAsFactors = FALSE
  ) |>
  dplyr::rename(
    SNP = V1
  )

head(snps_scan)
##            SNP
## 1 AX-583054970
## 2 AX-583093532
## 3 AX-583142560
## 4 AX-583237406
## 5 AX-583279927
## 6 AX-583320377

Import bim file

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
snps <- import_bim(here("output", "ldna", "files", "file1.bim"))

# Check it
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-581444870     0    97856 C       T      
## 2 1        AX-583035083     0   305518 A       G      
## 3 1        AX-583035102     0   308124 A       G      
## 4 1        AX-583033342     0   315059 C       G      
## 5 1        AX-583035163     0   315386 A       G      
## 6 1        AX-583033356     0   315674 C       T

Merge the objects

merged_snps <- inner_join(snps_scan, snps, by = "SNP") |>
  dplyr::rename(
    Chromosome = Scaffold
  )
head(merged_snps)
##            SNP Chromosome Cm Position Allele1 Allele2
## 1 AX-583054970          1  0  5342242       G       A
## 2 AX-583093532          1  0 18373267       C       T
## 3 AX-583142560          1  0 33012482       T       C
## 4 AX-583237406          1  0 66299838       T       C
## 5 AX-583279927          1  0 80133483       A       G
## 6 AX-583320377          1  0 93313407       G       A

We cann add them as lines in the plot

# First, create a new data frame with 1Mb windows and count SNPs in each window
merged_snps$Window <- floor(merged_snps$Position / 20e6) * 20e6 # 20mb
snp_counts <- merged_snps %>%
  group_by(Chromosome, Window) %>%
  summarize(SNP_count = n(), .groups = 'drop')

# Now, add this SNP information to the plot
snp_plot <- ggplot() +
  geom_rect(data = albo2, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size, fill = as.factor(Cluster)), 
            color = "black", linewidth = 0.2) +
  geom_vline(data = snp_counts, aes(xintercept = Window + 0.5e6), color = "pink", linetype = "dotted", linewidth = 0.5) +
  geom_text(data = snp_counts, aes(x = Window + 0.5e6, y = 8e6, label = SNP_count), vjust = -0.5, color = "blue", size = 2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position (Mb)", y = "Cluster Size (Mb)") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotdash"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed")

# Display the plot
print(snp_plot)
## Warning: Combining variables of class <numeric> and <character> was deprecated in
## ggplot2 3.4.0.
## ℹ Please ensure your variables are compatible before plotting (location:
##   `combine_vars()`)
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "linkage_snps_genes_de.pdf"),
  device = "pdf",
  width = 8,
  height = 5,
  units = "in"
)

Find the non-overlapping clusters

# Split the data into two subsets based on Population
albo_NEW <- subset(albo2, Population == "NEW")
albo_AUT <- subset(albo2, Population == "AUT")

# Function to find non-overlapping clusters
find_non_overlapping_clusters <- function(new_clusters, aut_clusters) {
  non_overlapping <- vector("list", length(new_clusters$Chromosome))
  names(non_overlapping) <- unique(new_clusters$Chromosome)
  
  for (chr in unique(new_clusters$Chromosome)) {
    new_chrom_clusters <- new_clusters[new_clusters$Chromosome == chr, ]
    aut_chrom_clusters <- aut_clusters[aut_clusters$Chromosome == chr, ]
    
    non_overlap <- vector("list", nrow(new_chrom_clusters))
    
    for (i in 1:nrow(new_chrom_clusters)) {
      overlaps <- any(aut_chrom_clusters$Start <= new_chrom_clusters$End[i] & aut_chrom_clusters$End >= new_chrom_clusters$Start[i])
      if (!overlaps) {
        non_overlap[[i]] <- new_chrom_clusters[i, ]
      }
    }
    
    non_overlapping[[chr]] <- do.call(rbind, non_overlap)
  }
  
  do.call(rbind, non_overlapping)
}

# Find non-overlapping clusters
non_overlapping_clusters <- find_non_overlapping_clusters(albo_NEW, albo_AUT)

# View the non-overlapping clusters
non_overlapping_clusters
##       Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 1.29         NEW          1 1554_0.86 0.86        41    50  11618085  16238366
## 1.30         NEW          1 1788_0.84 0.84       266   353  46848250  49120885
## 1.31         NEW          1 1788_0.84 0.84       266   353  75613715  77269296
## 1.32         NEW          1 1560_0.86 0.86        93   109  88407493  90438527
## 1.33         NEW          1 1788_0.84 0.84       266   353  93547558  95297714
## 1.34         NEW          1 1782_0.84 0.84        93   119 105482568 107443193
## 1.35         NEW          1 1317_0.88 0.88        17    22 169179646 170784441
## 1.36         NEW          1 1782_0.84 0.84        93   119 208024693 209090551
## 1.37         NEW          1 1146_0.89 0.89        62    81 226898202 228109207
## 2.38         NEW          2 2543_0.84 0.84       372   674  17612329  18832351
## 2.39         NEW          2 2748_0.83 0.83       161   208  36257115  37270860
## 2.40         NEW          2 2543_0.84 0.84       372   674  61309812  63385698
## 2.41         NEW          2 2545_0.84 0.84       351   633  82822354  84538637
## 2.42         NEW          2 2545_0.84 0.84       351   633 129240899 132723328
## 2.43         NEW          2 2545_0.84 0.84       351   633 136289009 137527090
## 2.44         NEW          2 2624_0.84 0.84        15    37 159391505 160872568
## 2.45         NEW          2 2624_0.84 0.84        15    37 160918536 164996020
## 2.46         NEW          2 2748_0.83 0.83       161   208 192585621 193792689
## 2.47         NEW          2 2713_0.83 0.83        34    39 200640262 202278042
## 2.48         NEW          2 3951_0.75 0.75        38    62 203165026 204246205
## 2.49         NEW          2 2748_0.83 0.83       161   208 229785687 232287067
## 2.50         NEW          2 2748_0.83 0.83       161   208 232970945 234100828
## 2.51         NEW          2 2545_0.84 0.84       351   633 247327976 248553203
## 2.52         NEW          2 3426_0.79 0.79       240   326 263127511 264361900
## 2.53         NEW          2 2545_0.84 0.84       351   633 267169541 269283227
## 2.54         NEW          2 2545_0.84 0.84       351   633 280710015 283869411
## 2.55         NEW          2 2545_0.84 0.84       351   633 300659500 304360746
## 2.56         NEW          2 3951_0.75 0.75        38    62 306590966 308633302
## 2.58         NEW          2 2748_0.83 0.83       161   208 343930528 351760687
## 2.59         NEW          2 2995_0.82 0.82         4    39 365887578 368253627
## 2.60         NEW          2 2545_0.84 0.84       351   633 370117893 375162415
## 2.62         NEW          2 2545_0.84 0.84       351   633 416477946 420091640
## 2.63         NEW          2 2545_0.84 0.84       351   633 423480817 425459834
## 2.64         NEW          2 2545_0.84 0.84       351   633 433356850 434926662
## 2.65         NEW          2 2543_0.84 0.84       372   674 464468067 465572729
## 2.66         NEW          2 3426_0.79 0.79       240   326 475606473 477615525
## 2.67         NEW          2 2543_0.84 0.84       372   674 480159534 481378592
## 2.68         NEW          2 2543_0.84 0.84       372   674 483395086 484674520
## 2.69         NEW          2 2543_0.84 0.84       372   674 489711160 491614193
## 2.70         NEW          2 2545_0.84 0.84       351   633 514915553 516163725
## 2.71         NEW          2 2543_0.84 0.84       372   674 520801817 522038684
## 2.72         NEW          2 2545_0.84 0.84       351   633 540102890 541125553
## 2.73         NEW          2 2543_0.84 0.84       372   674 549401789 550492268
## 2.74         NEW          2 3426_0.79 0.79       240   326 551614176 553072100
## 2.75         NEW          2 2544_0.84 0.84        52    79 584580570 585804603
## 3.76         NEW          3 2183_0.84 0.84       169   342   6215232   8280336
## 3.77         NEW          3 1916_0.86 0.86        43    66   8798971  11276541
## 3.79         NEW          3 2533_0.82 0.82       198   337  46782114  48486026
## 3.80         NEW          3 2533_0.82 0.82       198   337  76622043  77953224
## 3.81         NEW          3 2183_0.84 0.84       169   342  88286681  89538915
## 3.82         NEW          3 2533_0.82 0.82       198   337  96387529  97703905
## 3.83         NEW          3 2533_0.82 0.82       198   337 103978542 105140618
## 3.84         NEW          3 2320_0.83 0.83       185   260 108099773 109990109
## 3.85         NEW          3 2533_0.82 0.82       198   337 127896703 133213688
## 3.86         NEW          3 2700_0.81 0.81        34    51 133371924 134578256
## 3.88         NEW          3 2533_0.82 0.82       198   337 155504549 159797863
## 3.89         NEW          3 2533_0.82 0.82       198   337 162731902 163859867
## 3.90         NEW          3 2533_0.82 0.82       198   337 163885638 167366396
## 3.91         NEW          3 2533_0.82 0.82       198   337 169349891 174797414
## 3.92         NEW          3 2533_0.82 0.82       198   337 176420939 179903623
## 3.93         NEW          3 2533_0.82 0.82       198   337 209803232 212409397
## 3.95         NEW          3 3925_0.72 0.72         3    42 246084752 251368334
## 3.96         NEW          3 2700_0.81 0.81        34    51 280219748 282070496
## 3.97         NEW          3 2533_0.82 0.82       198   337 303715842 309745288
## 3.98         NEW          3 2183_0.84 0.84       169   342 321054349 322347291
## 3.99         NEW          3 2183_0.84 0.84       169   342 344380481 345522786
## 3.100        NEW          3 2320_0.83 0.83       185   260 380107019 381564291
## 3.101        NEW          3 2183_0.84 0.84       169   342 384324897 385372688
## 3.102        NEW          3 2183_0.84 0.84       169   342 385585248 387315713
## 3.103        NEW          3 2183_0.84 0.84       169   342 442489714 443498522
## 3.104        NEW          3 2183_0.84 0.84       169   342 443555174 445661410
## 3.105        NEW          3 2183_0.84 0.84       169   342 446376561 447535850
## 3.106        NEW          3 2183_0.84 0.84       169   342 467722457 470726943
##          Size Size (Mb)
## 1.29  4620281      4.62
## 1.30  2272635      2.27
## 1.31  1655581      1.66
## 1.32  2031034      2.03
## 1.33  1750156      1.75
## 1.34  1960625      1.96
## 1.35  1604795      1.60
## 1.36  1065858      1.07
## 1.37  1211005      1.21
## 2.38  1220022      1.22
## 2.39  1013745      1.01
## 2.40  2075886      2.08
## 2.41  1716283      1.72
## 2.42  3482429      3.48
## 2.43  1238081      1.24
## 2.44  1481063      1.48
## 2.45  4077484      4.08
## 2.46  1207068      1.21
## 2.47  1637780      1.64
## 2.48  1081179      1.08
## 2.49  2501380      2.50
## 2.50  1129883      1.13
## 2.51  1225227      1.23
## 2.52  1234389      1.23
## 2.53  2113686      2.11
## 2.54  3159396      3.16
## 2.55  3701246      3.70
## 2.56  2042336      2.04
## 2.58  7830159      7.83
## 2.59  2366049      2.37
## 2.60  5044522      5.04
## 2.62  3613694      3.61
## 2.63  1979017      1.98
## 2.64  1569812      1.57
## 2.65  1104662      1.10
## 2.66  2009052      2.01
## 2.67  1219058      1.22
## 2.68  1279434      1.28
## 2.69  1903033      1.90
## 2.70  1248172      1.25
## 2.71  1236867      1.24
## 2.72  1022663      1.02
## 2.73  1090479      1.09
## 2.74  1457924      1.46
## 2.75  1224033      1.22
## 3.76  2065104      2.07
## 3.77  2477570      2.48
## 3.79  1703912      1.70
## 3.80  1331181      1.33
## 3.81  1252234      1.25
## 3.82  1316376      1.32
## 3.83  1162076      1.16
## 3.84  1890336      1.89
## 3.85  5316985      5.32
## 3.86  1206332      1.21
## 3.88  4293314      4.29
## 3.89  1127965      1.13
## 3.90  3480758      3.48
## 3.91  5447523      5.45
## 3.92  3482684      3.48
## 3.93  2606165      2.61
## 3.95  5283582      5.28
## 3.96  1850748      1.85
## 3.97  6029446      6.03
## 3.98  1292942      1.29
## 3.99  1142305      1.14
## 3.100 1457272      1.46
## 3.101 1047791      1.05
## 3.102 1730465      1.73
## 3.103 1008808      1.01
## 3.104 2106236      2.11
## 3.105 1159289      1.16
## 3.106 3004486      3.00

Sanity check: Second way to get it

# Function to find non-overlapping segments
find_non_overlapping_segments <- function(new_clusters, aut_clusters) {
  non_overlapping <- list()
  
  for (chr in unique(new_clusters$Chromosome)) {
    new_chrom_segments <- new_clusters[new_clusters$Chromosome == chr, ]
    aut_chrom_segments <- aut_clusters[aut_clusters$Chromosome == chr, ]
    
    non_overlap <- list()
    
    for (i in 1:nrow(new_chrom_segments)) {
      overlaps <- FALSE
      for (j in 1:nrow(aut_chrom_segments)) {
        if (new_chrom_segments$Start[i] <= aut_chrom_segments$End[j] && new_chrom_segments$End[i] >= aut_chrom_segments$Start[j]) {
          overlaps <- TRUE
          break
        }
      }
      
      if (!overlaps) {
        non_overlap <- c(non_overlap, list(new_chrom_segments[i, ]))
      }
    }
    
    non_overlapping[[chr]] <- do.call(rbind, non_overlap)
  }
  
  do.call(rbind, non_overlapping)
}

# Find non-overlapping segments
non_overlapping_segments <- find_non_overlapping_segments(albo_NEW, albo_AUT)

# View the non-overlapping segments
non_overlapping_segments
##     Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 29         NEW          1 1554_0.86 0.86        41    50  11618085  16238366
## 30         NEW          1 1788_0.84 0.84       266   353  46848250  49120885
## 31         NEW          1 1788_0.84 0.84       266   353  75613715  77269296
## 32         NEW          1 1560_0.86 0.86        93   109  88407493  90438527
## 33         NEW          1 1788_0.84 0.84       266   353  93547558  95297714
## 34         NEW          1 1782_0.84 0.84        93   119 105482568 107443193
## 35         NEW          1 1317_0.88 0.88        17    22 169179646 170784441
## 36         NEW          1 1782_0.84 0.84        93   119 208024693 209090551
## 37         NEW          1 1146_0.89 0.89        62    81 226898202 228109207
## 38         NEW          2 2543_0.84 0.84       372   674  17612329  18832351
## 39         NEW          2 2748_0.83 0.83       161   208  36257115  37270860
## 40         NEW          2 2543_0.84 0.84       372   674  61309812  63385698
## 41         NEW          2 2545_0.84 0.84       351   633  82822354  84538637
## 42         NEW          2 2545_0.84 0.84       351   633 129240899 132723328
## 43         NEW          2 2545_0.84 0.84       351   633 136289009 137527090
## 44         NEW          2 2624_0.84 0.84        15    37 159391505 160872568
## 45         NEW          2 2624_0.84 0.84        15    37 160918536 164996020
## 46         NEW          2 2748_0.83 0.83       161   208 192585621 193792689
## 47         NEW          2 2713_0.83 0.83        34    39 200640262 202278042
## 48         NEW          2 3951_0.75 0.75        38    62 203165026 204246205
## 49         NEW          2 2748_0.83 0.83       161   208 229785687 232287067
## 50         NEW          2 2748_0.83 0.83       161   208 232970945 234100828
## 51         NEW          2 2545_0.84 0.84       351   633 247327976 248553203
## 52         NEW          2 3426_0.79 0.79       240   326 263127511 264361900
## 53         NEW          2 2545_0.84 0.84       351   633 267169541 269283227
## 54         NEW          2 2545_0.84 0.84       351   633 280710015 283869411
## 55         NEW          2 2545_0.84 0.84       351   633 300659500 304360746
## 56         NEW          2 3951_0.75 0.75        38    62 306590966 308633302
## 58         NEW          2 2748_0.83 0.83       161   208 343930528 351760687
## 59         NEW          2 2995_0.82 0.82         4    39 365887578 368253627
## 60         NEW          2 2545_0.84 0.84       351   633 370117893 375162415
## 62         NEW          2 2545_0.84 0.84       351   633 416477946 420091640
## 63         NEW          2 2545_0.84 0.84       351   633 423480817 425459834
## 64         NEW          2 2545_0.84 0.84       351   633 433356850 434926662
## 65         NEW          2 2543_0.84 0.84       372   674 464468067 465572729
## 66         NEW          2 3426_0.79 0.79       240   326 475606473 477615525
## 67         NEW          2 2543_0.84 0.84       372   674 480159534 481378592
## 68         NEW          2 2543_0.84 0.84       372   674 483395086 484674520
## 69         NEW          2 2543_0.84 0.84       372   674 489711160 491614193
## 70         NEW          2 2545_0.84 0.84       351   633 514915553 516163725
## 71         NEW          2 2543_0.84 0.84       372   674 520801817 522038684
## 72         NEW          2 2545_0.84 0.84       351   633 540102890 541125553
## 73         NEW          2 2543_0.84 0.84       372   674 549401789 550492268
## 74         NEW          2 3426_0.79 0.79       240   326 551614176 553072100
## 75         NEW          2 2544_0.84 0.84        52    79 584580570 585804603
## 76         NEW          3 2183_0.84 0.84       169   342   6215232   8280336
## 77         NEW          3 1916_0.86 0.86        43    66   8798971  11276541
## 79         NEW          3 2533_0.82 0.82       198   337  46782114  48486026
## 80         NEW          3 2533_0.82 0.82       198   337  76622043  77953224
## 81         NEW          3 2183_0.84 0.84       169   342  88286681  89538915
## 82         NEW          3 2533_0.82 0.82       198   337  96387529  97703905
## 83         NEW          3 2533_0.82 0.82       198   337 103978542 105140618
## 84         NEW          3 2320_0.83 0.83       185   260 108099773 109990109
## 85         NEW          3 2533_0.82 0.82       198   337 127896703 133213688
## 86         NEW          3 2700_0.81 0.81        34    51 133371924 134578256
## 88         NEW          3 2533_0.82 0.82       198   337 155504549 159797863
## 89         NEW          3 2533_0.82 0.82       198   337 162731902 163859867
## 90         NEW          3 2533_0.82 0.82       198   337 163885638 167366396
## 91         NEW          3 2533_0.82 0.82       198   337 169349891 174797414
## 92         NEW          3 2533_0.82 0.82       198   337 176420939 179903623
## 93         NEW          3 2533_0.82 0.82       198   337 209803232 212409397
## 95         NEW          3 3925_0.72 0.72         3    42 246084752 251368334
## 96         NEW          3 2700_0.81 0.81        34    51 280219748 282070496
## 97         NEW          3 2533_0.82 0.82       198   337 303715842 309745288
## 98         NEW          3 2183_0.84 0.84       169   342 321054349 322347291
## 99         NEW          3 2183_0.84 0.84       169   342 344380481 345522786
## 100        NEW          3 2320_0.83 0.83       185   260 380107019 381564291
## 101        NEW          3 2183_0.84 0.84       169   342 384324897 385372688
## 102        NEW          3 2183_0.84 0.84       169   342 385585248 387315713
## 103        NEW          3 2183_0.84 0.84       169   342 442489714 443498522
## 104        NEW          3 2183_0.84 0.84       169   342 443555174 445661410
## 105        NEW          3 2183_0.84 0.84       169   342 446376561 447535850
## 106        NEW          3 2183_0.84 0.84       169   342 467722457 470726943
##        Size Size (Mb)
## 29  4620281      4.62
## 30  2272635      2.27
## 31  1655581      1.66
## 32  2031034      2.03
## 33  1750156      1.75
## 34  1960625      1.96
## 35  1604795      1.60
## 36  1065858      1.07
## 37  1211005      1.21
## 38  1220022      1.22
## 39  1013745      1.01
## 40  2075886      2.08
## 41  1716283      1.72
## 42  3482429      3.48
## 43  1238081      1.24
## 44  1481063      1.48
## 45  4077484      4.08
## 46  1207068      1.21
## 47  1637780      1.64
## 48  1081179      1.08
## 49  2501380      2.50
## 50  1129883      1.13
## 51  1225227      1.23
## 52  1234389      1.23
## 53  2113686      2.11
## 54  3159396      3.16
## 55  3701246      3.70
## 56  2042336      2.04
## 58  7830159      7.83
## 59  2366049      2.37
## 60  5044522      5.04
## 62  3613694      3.61
## 63  1979017      1.98
## 64  1569812      1.57
## 65  1104662      1.10
## 66  2009052      2.01
## 67  1219058      1.22
## 68  1279434      1.28
## 69  1903033      1.90
## 70  1248172      1.25
## 71  1236867      1.24
## 72  1022663      1.02
## 73  1090479      1.09
## 74  1457924      1.46
## 75  1224033      1.22
## 76  2065104      2.07
## 77  2477570      2.48
## 79  1703912      1.70
## 80  1331181      1.33
## 81  1252234      1.25
## 82  1316376      1.32
## 83  1162076      1.16
## 84  1890336      1.89
## 85  5316985      5.32
## 86  1206332      1.21
## 88  4293314      4.29
## 89  1127965      1.13
## 90  3480758      3.48
## 91  5447523      5.45
## 92  3482684      3.48
## 93  2606165      2.61
## 95  5283582      5.28
## 96  1850748      1.85
## 97  6029446      6.03
## 98  1292942      1.29
## 99  1142305      1.14
## 100 1457272      1.46
## 101 1047791      1.05
## 102 1730465      1.73
## 103 1008808      1.01
## 104 2106236      2.11
## 105 1159289      1.16
## 106 3004486      3.00
# Function to find unique non-overlapping clusters for each population
find_unique_clusters <- function(population1, population2) {
  unique_clusters <- list()
  
  for (chr in unique(population1$Chromosome)) {
    pop1_chrom_clusters <- population1[population1$Chromosome == chr, ]
    pop2_chrom_clusters <- population2[population2$Chromosome == chr, ]
    
    unique_clusters_chr <- list()
    
    for (i in 1:nrow(pop1_chrom_clusters)) {
      overlaps <- any(
        pop2_chrom_clusters$Start <= pop1_chrom_clusters$End[i] & 
        pop2_chrom_clusters$End >= pop1_chrom_clusters$Start[i]
      )
      
      if (!overlaps) {
        unique_clusters_chr <- c(unique_clusters_chr, list(pop1_chrom_clusters[i, ]))
      }
    }
    
    if (length(unique_clusters_chr) > 0) {
      unique_clusters[[chr]] <- do.call(rbind, unique_clusters_chr)
    }
  }
  
  do.call(rbind, unique_clusters)
}

# Find unique non-overlapping clusters for NEW and AUT populations
unique_clusters_NEW <- find_unique_clusters(albo2[albo2$Population == "NEW",], albo2[albo2$Population == "AUT",])
unique_clusters_AUT <- find_unique_clusters(albo2[albo2$Population == "AUT",], albo2[albo2$Population == "NEW",])

# View the unique non-overlapping clusters for each population
unique_clusters_NEW
##     Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 29         NEW          1 1554_0.86 0.86        41    50  11618085  16238366
## 30         NEW          1 1788_0.84 0.84       266   353  46848250  49120885
## 31         NEW          1 1788_0.84 0.84       266   353  75613715  77269296
## 32         NEW          1 1560_0.86 0.86        93   109  88407493  90438527
## 33         NEW          1 1788_0.84 0.84       266   353  93547558  95297714
## 34         NEW          1 1782_0.84 0.84        93   119 105482568 107443193
## 35         NEW          1 1317_0.88 0.88        17    22 169179646 170784441
## 36         NEW          1 1782_0.84 0.84        93   119 208024693 209090551
## 37         NEW          1 1146_0.89 0.89        62    81 226898202 228109207
## 38         NEW          2 2543_0.84 0.84       372   674  17612329  18832351
## 39         NEW          2 2748_0.83 0.83       161   208  36257115  37270860
## 40         NEW          2 2543_0.84 0.84       372   674  61309812  63385698
## 41         NEW          2 2545_0.84 0.84       351   633  82822354  84538637
## 42         NEW          2 2545_0.84 0.84       351   633 129240899 132723328
## 43         NEW          2 2545_0.84 0.84       351   633 136289009 137527090
## 44         NEW          2 2624_0.84 0.84        15    37 159391505 160872568
## 45         NEW          2 2624_0.84 0.84        15    37 160918536 164996020
## 46         NEW          2 2748_0.83 0.83       161   208 192585621 193792689
## 47         NEW          2 2713_0.83 0.83        34    39 200640262 202278042
## 48         NEW          2 3951_0.75 0.75        38    62 203165026 204246205
## 49         NEW          2 2748_0.83 0.83       161   208 229785687 232287067
## 50         NEW          2 2748_0.83 0.83       161   208 232970945 234100828
## 51         NEW          2 2545_0.84 0.84       351   633 247327976 248553203
## 52         NEW          2 3426_0.79 0.79       240   326 263127511 264361900
## 53         NEW          2 2545_0.84 0.84       351   633 267169541 269283227
## 54         NEW          2 2545_0.84 0.84       351   633 280710015 283869411
## 55         NEW          2 2545_0.84 0.84       351   633 300659500 304360746
## 56         NEW          2 3951_0.75 0.75        38    62 306590966 308633302
## 58         NEW          2 2748_0.83 0.83       161   208 343930528 351760687
## 59         NEW          2 2995_0.82 0.82         4    39 365887578 368253627
## 60         NEW          2 2545_0.84 0.84       351   633 370117893 375162415
## 62         NEW          2 2545_0.84 0.84       351   633 416477946 420091640
## 63         NEW          2 2545_0.84 0.84       351   633 423480817 425459834
## 64         NEW          2 2545_0.84 0.84       351   633 433356850 434926662
## 65         NEW          2 2543_0.84 0.84       372   674 464468067 465572729
## 66         NEW          2 3426_0.79 0.79       240   326 475606473 477615525
## 67         NEW          2 2543_0.84 0.84       372   674 480159534 481378592
## 68         NEW          2 2543_0.84 0.84       372   674 483395086 484674520
## 69         NEW          2 2543_0.84 0.84       372   674 489711160 491614193
## 70         NEW          2 2545_0.84 0.84       351   633 514915553 516163725
## 71         NEW          2 2543_0.84 0.84       372   674 520801817 522038684
## 72         NEW          2 2545_0.84 0.84       351   633 540102890 541125553
## 73         NEW          2 2543_0.84 0.84       372   674 549401789 550492268
## 74         NEW          2 3426_0.79 0.79       240   326 551614176 553072100
## 75         NEW          2 2544_0.84 0.84        52    79 584580570 585804603
## 76         NEW          3 2183_0.84 0.84       169   342   6215232   8280336
## 77         NEW          3 1916_0.86 0.86        43    66   8798971  11276541
## 79         NEW          3 2533_0.82 0.82       198   337  46782114  48486026
## 80         NEW          3 2533_0.82 0.82       198   337  76622043  77953224
## 81         NEW          3 2183_0.84 0.84       169   342  88286681  89538915
## 82         NEW          3 2533_0.82 0.82       198   337  96387529  97703905
## 83         NEW          3 2533_0.82 0.82       198   337 103978542 105140618
## 84         NEW          3 2320_0.83 0.83       185   260 108099773 109990109
## 85         NEW          3 2533_0.82 0.82       198   337 127896703 133213688
## 86         NEW          3 2700_0.81 0.81        34    51 133371924 134578256
## 88         NEW          3 2533_0.82 0.82       198   337 155504549 159797863
## 89         NEW          3 2533_0.82 0.82       198   337 162731902 163859867
## 90         NEW          3 2533_0.82 0.82       198   337 163885638 167366396
## 91         NEW          3 2533_0.82 0.82       198   337 169349891 174797414
## 92         NEW          3 2533_0.82 0.82       198   337 176420939 179903623
## 93         NEW          3 2533_0.82 0.82       198   337 209803232 212409397
## 95         NEW          3 3925_0.72 0.72         3    42 246084752 251368334
## 96         NEW          3 2700_0.81 0.81        34    51 280219748 282070496
## 97         NEW          3 2533_0.82 0.82       198   337 303715842 309745288
## 98         NEW          3 2183_0.84 0.84       169   342 321054349 322347291
## 99         NEW          3 2183_0.84 0.84       169   342 344380481 345522786
## 100        NEW          3 2320_0.83 0.83       185   260 380107019 381564291
## 101        NEW          3 2183_0.84 0.84       169   342 384324897 385372688
## 102        NEW          3 2183_0.84 0.84       169   342 385585248 387315713
## 103        NEW          3 2183_0.84 0.84       169   342 442489714 443498522
## 104        NEW          3 2183_0.84 0.84       169   342 443555174 445661410
## 105        NEW          3 2183_0.84 0.84       169   342 446376561 447535850
## 106        NEW          3 2183_0.84 0.84       169   342 467722457 470726943
##        Size Size (Mb)
## 29  4620281      4.62
## 30  2272635      2.27
## 31  1655581      1.66
## 32  2031034      2.03
## 33  1750156      1.75
## 34  1960625      1.96
## 35  1604795      1.60
## 36  1065858      1.07
## 37  1211005      1.21
## 38  1220022      1.22
## 39  1013745      1.01
## 40  2075886      2.08
## 41  1716283      1.72
## 42  3482429      3.48
## 43  1238081      1.24
## 44  1481063      1.48
## 45  4077484      4.08
## 46  1207068      1.21
## 47  1637780      1.64
## 48  1081179      1.08
## 49  2501380      2.50
## 50  1129883      1.13
## 51  1225227      1.23
## 52  1234389      1.23
## 53  2113686      2.11
## 54  3159396      3.16
## 55  3701246      3.70
## 56  2042336      2.04
## 58  7830159      7.83
## 59  2366049      2.37
## 60  5044522      5.04
## 62  3613694      3.61
## 63  1979017      1.98
## 64  1569812      1.57
## 65  1104662      1.10
## 66  2009052      2.01
## 67  1219058      1.22
## 68  1279434      1.28
## 69  1903033      1.90
## 70  1248172      1.25
## 71  1236867      1.24
## 72  1022663      1.02
## 73  1090479      1.09
## 74  1457924      1.46
## 75  1224033      1.22
## 76  2065104      2.07
## 77  2477570      2.48
## 79  1703912      1.70
## 80  1331181      1.33
## 81  1252234      1.25
## 82  1316376      1.32
## 83  1162076      1.16
## 84  1890336      1.89
## 85  5316985      5.32
## 86  1206332      1.21
## 88  4293314      4.29
## 89  1127965      1.13
## 90  3480758      3.48
## 91  5447523      5.45
## 92  3482684      3.48
## 93  2606165      2.61
## 95  5283582      5.28
## 96  1850748      1.85
## 97  6029446      6.03
## 98  1292942      1.29
## 99  1142305      1.14
## 100 1457272      1.46
## 101 1047791      1.05
## 102 1730465      1.73
## 103 1008808      1.01
## 104 2106236      2.11
## 105 1159289      1.16
## 106 3004486      3.00
unique_clusters_AUT
##    Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 1         AUT          1  1972_0.6  0.6        20   124    442875   2339987
## 2         AUT          1 1844_0.71 0.71        20    45   5443736  10957198
## 3         AUT          1 1777_0.76 0.76        45    61  87157392  88231134
## 4         AUT          1 1693_0.79 0.79        25    27 102144753 103394842
## 5         AUT          1 1596_0.82 0.82        21    50 119130691 120305920
## 6         AUT          1 1433_0.85 0.85        18    33 131161343 132428189
## 7         AUT          1 1942_0.64 0.64        22    88 138373561 140570398
## 8         AUT          1 1528_0.84 0.84        28    38 171598966 172728015
## 9         AUT          1 1528_0.84 0.84        28    38 188134463 189136637
## 10        AUT          1  1124_0.9  0.9       211   248 260581858 262016624
## 11        AUT          1 1948_0.63 0.63        21    90 266160984 270116064
## 12        AUT          1  1972_0.6  0.6        20   124 276166620 280693512
## 13        AUT          1  1972_0.6  0.6        20   124 281430183 285327100
## 14        AUT          2 2185_0.89 0.89       190   213 123252079 124310386
## 15        AUT          2 2180_0.89 0.89       394   479 149078628 151226909
## 17        AUT          2  3457_0.6  0.6        30    66 328800255 329979871
## 18        AUT          2 3139_0.78 0.78        70   316 390518548 394323260
## 19        AUT          2 3139_0.78 0.78        70   316 395917900 402963765
## 22        AUT          3 2465_0.87 0.87       422   507  23447984  24876231
## 25        AUT          3 2807_0.83 0.83        81   213 199465149 200641223
## 26        AUT          3 2807_0.83 0.83        81   213 201927576 204102298
## 27        AUT          3 3147_0.73 0.73        47    89 214452458 215749074
##       Size Size (Mb)
## 1  1897112      1.90
## 2  5513462      5.51
## 3  1073742      1.07
## 4  1250089      1.25
## 5  1175229      1.18
## 6  1266846      1.27
## 7  2196837      2.20
## 8  1129049      1.13
## 9  1002174      1.00
## 10 1434766      1.43
## 11 3955080      3.96
## 12 4526892      4.53
## 13 3896917      3.90
## 14 1058307      1.06
## 15 2148281      2.15
## 17 1179616      1.18
## 18 3804712      3.80
## 19 7045865      7.05
## 22 1428247      1.43
## 25 1176074      1.18
## 26 2174722      2.17
## 27 1296616      1.30

Plot it

# Combine the two data frames
combined_clusters <- rbind(unique_clusters_NEW, unique_clusters_AUT)


# Function to format numbers as Mb
label_mb <- function(x) {
  sprintf("%.0fMb", x / 1e6)
}

# Plot it
ggplot(combined_clusters, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")  # Adjust the unit and number to increase space as needed
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed") +
  guides(fill = "none")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_fixed_non_overlapping.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)
snps_DEgenes<- read_delim(here("output", "snpeff", "SNPs_on_DE_genes.txt"), delim = "\t", col_names = TRUE, show_col_types = FALSE)
head(snps_DEgenes)
## # A tibble: 6 × 3
##   SNP          Chromosome Position_chr
##   <chr>             <dbl>        <dbl>
## 1 AX-583050970          1      3777633
## 2 AX-583052853          1      3777864
## 3 AX-583051011          1      3779183
## 4 AX-583052887          1      3793314
## 5 AX-583051025          1      3793653
## 6 AX-583052920          1      3794185

Check if any SNP fall within the clusters

# Initialize an empty data frame to store the results
results <- data.frame(Population = character(), Chromosome = integer(), 
                      Cluster = character(), r2 = numeric(), 
                      nSegments = integer(), nSNPs = integer(), 
                      Start = integer(), End = integer(), Size = integer(), 
                      SizeMb = numeric(), SNP = character(), Position_chr = integer(), 
                      stringsAsFactors = FALSE)

for (i in 1:nrow(snps_DEgenes)) {
    for (j in 1:nrow(combined_clusters)) {
        if (snps_DEgenes$Chromosome[i] == combined_clusters$Chromosome[j] &&
            snps_DEgenes$Position_chr[i] >= combined_clusters$Start[j] &&
            snps_DEgenes$Position_chr[i] <= combined_clusters$End[j]) {

            # Create a new row as a data frame with the same column names
            new_row <- data.frame(Population = combined_clusters$Population[j], 
                                  Chromosome = combined_clusters$Chromosome[j], 
                                  Cluster = combined_clusters$Cluster[j], 
                                  r2 = combined_clusters$r2[j], 
                                  nSegments = combined_clusters$nSegments[j], 
                                  nSNPs = combined_clusters$nSNPs[j], 
                                  Start = combined_clusters$Start[j], 
                                  End = combined_clusters$End[j], 
                                  Size = combined_clusters$Size[j], 
                                  SizeMb = combined_clusters$Size[j], 
                                  SNP = snps_DEgenes$SNP[i], 
                                  Position_chr = snps_DEgenes$Position_chr[i], 
                                  stringsAsFactors = FALSE)

            # Append the new row to the results data frame
            results <- rbind(results, new_row)
        }
    }
}


head(results)
##   Population Chromosome   Cluster   r2 nSegments nSNPs   Start      End    Size
## 1        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
## 2        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
## 3        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
## 4        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
## 5        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
## 6        AUT          1 1844_0.71 0.71        20    45 5443736 10957198 5513462
##    SizeMb          SNP Position_chr
## 1 5513462 AX-583055283      5443736
## 2 5513462 AX-583055363      5445016
## 3 5513462 AX-583055828      6260504
## 4 5513462 AX-583055939      6279591
## 5 5513462 AX-583055991      6280255
## 6 5513462 AX-583054215      6421768

We can count how many SNPs we have per cluster per population

# Use aggregate to count the number of SNPs per unique combination of Population, Chromosome, and Cluster
snp_counts <- aggregate(SNP ~ Population + Chromosome + Cluster, data = results, FUN = length)

# Rename the SNP column to reflect that it now contains counts
colnames(snp_counts)[which(colnames(snp_counts) == "SNP")] <- "SNP_Count"

# Check it
head(snp_counts)
##   Population Chromosome   Cluster SNP_Count
## 1        NEW          1 1146_0.89         9
## 2        AUT          1 1528_0.84         1
## 3        NEW          1 1788_0.84         1
## 4        AUT          1 1844_0.71        13
## 5        NEW          3 1916_0.86         2
## 6        AUT          1 1942_0.64         1

Plot the clusters for which we have SNPs on the DE genes

# Compute SNP counts for each cluster
snp_counts2 <- aggregate(SNP ~ Cluster, data = results, FUN = length)
colnames(snp_counts2)[2] <- "SNP_Count"

# Merge SNP counts with the results data frame
results_with_counts <- merge(results, snp_counts2, by = "Cluster")

ggplot(results_with_counts, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  geom_text(aes(x = (Start + End) / 2, y = Size + 0.5, label = SNP_Count), size = 3, vjust = -0.2) + # Annotate SNP counts on top
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed") +
  guides(fill = "none")

# Create a new categorical variable for fill aesthetic
results_with_counts$Cluster_SNP = with(results_with_counts, paste(Cluster, " (", SNP_Count, " SNPs)", sep=""))

# Updated ggplot code
ggplot(results_with_counts, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster_SNP)), color = "black", linewidth = 0.2) +
  scale_fill_discrete(name = "Cluster and SNP Count") +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "right",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed")

We cab add a number to identify each cluster

# Assign unique numbers to each cluster and create labels
results_with_counts <- results_with_counts %>%
  mutate(Cluster_Number = as.numeric(as.factor(Cluster))) %>%
  arrange(Cluster_Number) %>%
  mutate(Cluster_Label = paste(Cluster_Number, "_", Cluster, " (", SNP_Count, ")", sep=""))

# Reorder the factor levels of Cluster_Label based on Cluster_Number
results_with_counts$Cluster_Label <- factor(results_with_counts$Cluster_Label,
                                            levels = unique(results_with_counts$Cluster_Label[order(results_with_counts$Cluster_Number)]))

Save

saveRDS(results_with_counts, file = here("output", "ldna", "results_with_counts.rds"))

Read the data

results_with_counts <- readRDS(file = here("output", "ldna", "results_with_counts.rds"))
# Plotting
ggplot(results_with_counts, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = Cluster_Label), color = "black", linewidth = 0.2) +
  geom_text(aes(x = (Start + End) / 2, y = Size + 0.5, label = Cluster_Number), size = 3, vjust = -0.2) +
  scale_fill_discrete(name = "Cluster ID") +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "right",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_non_overlapping_clusters_with_SNPs_on_DE_genes.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

Now we can get the SNPs on the DE genes

snps79_DE_genes<- read_delim(here("output", "snpeff","SNPs_79_DE.txt"), delim = "\t", col_names = FALSE, show_col_types = FALSE)
head(snps79_DE_genes)
## # A tibble: 6 × 1
##   X1          
##   <chr>       
## 1 AX-583054970
## 2 AX-583093532
## 3 AX-583142560
## 4 AX-583237406
## 5 AX-583279927
## 6 AX-583320377

Or the SNPs from the selection scan

snps_selection <- read_delim(here("output", "pcadapt","outlier_157_SNPs.txt"), delim = "\t", col_names = FALSE, show_col_types = FALSE)
head(snps_selection)
## # A tibble: 6 × 1
##   X1          
##   <chr>       
## 1 AX-583054970
## 2 AX-583095890
## 3 AX-583324654
## 4 AX-583423493
## 5 AX-583426050
## 6 AX-583467551

Filter

# Create a logical vector to filter rows based on SNP IDs
filter_vector <- results_with_counts$SNP %in% snps79_DE_genes$X1
# filter_vector <- results_with_counts$SNP %in% snps_selection$X1  # we do not have any

# Subset results_with_counts to keep only rows where SNP matches
filtered_results <- results_with_counts[filter_vector, ]

head(filtered_results)
##       Cluster Population Chromosome   r2 nSegments nSNPs     Start       End
## 57  2533_0.82        NEW          3 0.82       198   337  96387529  97703905
## 71  2533_0.82        NEW          3 0.82       198   337 103978542 105140618
## 87  2533_0.82        NEW          3 0.82       198   337 303715842 309745288
## 106 2545_0.84        NEW          2 0.84       351   633 280710015 283869411
## 122 2748_0.83        NEW          2 0.83       161   208 343930528 351760687
## 123 2748_0.83        NEW          2 0.83       161   208 343930528 351760687
##        Size  SizeMb          SNP Position_chr SNP_Count         Cluster_SNP
## 57  1316376 1316376 AX-580706309     96536458        33 2533_0.82 (33 SNPs)
## 71  1162076 1162076 AX-580734190    105029277        33 2533_0.82 (33 SNPs)
## 87  6029446 6029446 AX-581839987    307692441        33 2533_0.82 (33 SNPs)
## 106 3159396 3159396 AX-584792883    281626870        16 2545_0.84 (16 SNPs)
## 122 7830159 7830159 AX-584949353    346997659         4  2748_0.83 (4 SNPs)
## 123 7830159 7830159 AX-585454571    345234664         4  2748_0.83 (4 SNPs)
##     Cluster_Number     Cluster_Label
## 57               8  8_2533_0.82 (33)
## 71               8  8_2533_0.82 (33)
## 87               8  8_2533_0.82 (33)
## 106             10 10_2545_0.84 (16)
## 122             12  12_2748_0.83 (4)
## 123             12  12_2748_0.83 (4)
# length(filtered_results)

We have 16 SNPs

# Assign unique numbers to each cluster and create labels
filtered_results <- filtered_results %>%
  mutate(Cluster_Number = as.numeric(as.factor(Cluster))) %>%
  arrange(Cluster_Number) %>%
  mutate(Cluster_Label = paste(Cluster_Number, "_", Cluster, " (", SNP_Count, ")", sep=""))

# Reorder the factor levels of Cluster_Label based on Cluster_Number
filtered_results$Cluster_Label <- factor(filtered_results$Cluster_Label,
                                            levels = unique(filtered_results$Cluster_Label[order(filtered_results$Cluster_Number)]))

# Plotting
ggplot(filtered_results, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = Cluster_Label), color = "black", linewidth = 0.2) +
  geom_text(aes(x = (Start + End) / 2, y = Size + 0.5, label = Cluster_Number), size = 3, vjust = -0.2) +
  scale_fill_discrete(name = "Cluster ID") +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "right",
    panel.spacing.x = unit(1, "lines")
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_overlapping_clusters_with_SNPs_on_DE_genes_intersect.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

This are the clusters for which we found the SNP as outlier and the gene the SNP in on is DE

Get the overlapping clusters

# Function to find overlapping clusters
find_overlapping_clusters <- function(population1, population2) {
  overlapping_clusters <- list()
  
  for (chr in unique(population1$Chromosome)) {
    pop1_chrom_clusters <- population1[population1$Chromosome == chr, ]
    pop2_chrom_clusters <- population2[population2$Chromosome == chr, ]
    
    overlapping_clusters_chr <- list()
    
    for (i in 1:nrow(pop1_chrom_clusters)) {
      for (j in 1:nrow(pop2_chrom_clusters)) {
        if (pop1_chrom_clusters$Start[i] <= pop2_chrom_clusters$End[j] && 
            pop1_chrom_clusters$End[i] >= pop2_chrom_clusters$Start[j]) {
          overlapping_clusters_chr <- c(overlapping_clusters_chr, list(pop1_chrom_clusters[i, ]))
          break
        }
      }
    }
    
    if (length(overlapping_clusters_chr) > 0) {
      overlapping_clusters[[chr]] <- do.call(rbind, overlapping_clusters_chr)
    }
  }
  
  do.call(rbind, overlapping_clusters)
}

# Subset data for NEW and AUT populations
albo_NEW <- subset(albo2, Population == "NEW")
albo_AUT <- subset(albo2, Population == "AUT")

# Find overlapping clusters between NEW and AUT
overlapping_clusters <- find_overlapping_clusters(albo_NEW, albo_AUT)

# View the overlapping clusters
overlapping_clusters
##    Population Chromosome   Cluster   r2 nSegments nSNPs     Start       End
## 57        NEW          2 2545_0.84 0.84       351   633 321501691 323875574
## 61        NEW          2 2545_0.84 0.84       351   633 407123516 415257912
## 78        NEW          3 2533_0.82 0.82       198   337  33859167  36902893
## 87        NEW          3 2533_0.82 0.82       198   337 143504855 146770743
## 94        NEW          3 2533_0.82 0.82       198   337 215824850 219669971
##       Size Size (Mb)
## 57 2373883      2.37
## 61 8134396      8.13
## 78 3043726      3.04
## 87 3265888      3.27
## 94 3845121      3.85

Plot it

# Function to format numbers as Mb
label_mb <- function(x) {
  sprintf("%.0fMb", x / 1e6)
}

# Plot it
ggplot(overlapping_clusters, aes(xmin = Start, xmax = End, ymin = 0, ymax = Size)) +
  geom_rect(aes(fill = as.factor(Cluster)), color = "black", linewidth = 0.2) +
  scale_x_continuous(labels = label_mb, breaks = pretty_breaks(n = 3)) +
  scale_y_continuous(labels = label_mb, breaks = pretty_breaks(n = 5)) +
  labs(x = "Position", y = "Cluster Size") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 5)),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_line(color = "gray", linetype = "dotted"),
    strip.background = element_rect(fill = "#e8e8e8", colour = NA),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    panel.spacing.x = unit(1, "lines")  # Adjust the unit and number to increase space as needed
  ) +
  facet_grid(Population ~ Chromosome, scales = "fixed", space = "fixed") +
  guides(fill = "none")

# Use ggsave to save the plot as a PDF
ggsave(
  filename = here("output", "ldna", "AUT_NEW_fixed_overlapping.pdf"),
  device = "pdf",
  width = 10,
  height = 5,
  units = "in"
)

8. Mapping back SNP clusters to scaffolds

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells 2919724 156.0    5389527  287.9         NA    5389527  287.9
## Vcells 5301811  40.5  451199133 3442.4      32768 1171951721 8941.3

I created the chromosomal scale for the linkage networks analysis, now we have to map back the SNPs from clusters of interest to the scaffold scale, then we can check the gene annotation file to find what genes are in the linkage group.

Cluster 14 is on chromosome 2 and only in the AUT line. I create files with the SNP ids, we can import it (on the legend is the cluster 14_3139_0.78 (18))

aut_ch2 <- readRDS(here("output", "ldna", "pop", "chr2", "AUT_clusters_snps.rds"))
str(aut_ch2$`3139_0.78`)
##  chr [1:566] "AX-579450486" "AX-579456609" "AX-579459157" "AX-579459504" ...
# We have 566 SNPs on this cluster

Now we can import the bim file with the scaffolds. We can use the backup file we created when we created the chromosomal scale.

head output/quality_control/file7_backup.bim
## 1.1  AX-581444870    0   97856   C   T
## 1.1  AX-583035083    0   305518  A   G
## 1.1  AX-583035102    0   308124  A   G
## 1.1  AX-583033342    0   315059  C   G
## 1.1  AX-583035163    0   315386  A   G
## 1.1  AX-583033356    0   315674  C   T
## 1.1  AX-583033370    0   330057  G   A
## 1.1  AX-583035194    0   330265  A   G
## 1.1  AX-583035198    0   330908  G   T
## 1.1  AX-583033387    0   331288  C   T

Import it

# Import the function
source(
  here(
    "scripts", "analysis", "import_bim.R")
)

# Import the data
snps <- import_bim(here("output", "quality_control", "file7_backup.bim"))

# Check it
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-581444870     0    97856 C       T      
## 2 1.1      AX-583035083     0   305518 A       G      
## 3 1.1      AX-583035102     0   308124 A       G      
## 4 1.1      AX-583033342     0   315059 C       G      
## 5 1.1      AX-583035163     0   315386 A       G      
## 6 1.1      AX-583033356     0   315674 C       T

Now we can filter the 566 SNPs that are in the cluster and see what scaffolds there are located

cluster_14 <- snps[snps$SNP %in% aut_ch2$`3139_0.78`, ]
head(cluster_14) # 316 SNPs
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 2.22     AX-579450486     0  1538827 G       A      
## 2 2.22     AX-579456609     0  2793553 T       G      
## 3 2.22     AX-579459157     0  3278309 C       T      
## 4 2.22     AX-579459504     0  3384885 C       A      
## 5 2.22     AX-579459790     0  3448373 T       C      
## 6 2.22     AX-579461404     0  3490196 C       T

Create table

# Create the flextable
my_flextable <- flextable(cluster_14)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Scaffold

SNP

Cm

Position

Allele1

Allele2

2.22

AX-579450486

0

1,538,827

G

A

2.22

AX-579456609

0

2,793,553

T

G

2.22

AX-579459157

0

3,278,309

C

T

2.22

AX-579459504

0

3,384,885

C

A

2.22

AX-579459790

0

3,448,373

T

C

2.22

AX-579461404

0

3,490,196

C

T

2.22

AX-579461838

0

3,576,474

G

A

2.22

AX-579461056

0

3,629,081

C

T

2.22

AX-579461271

0

3,653,077

C

T

2.22

AX-579462558

0

3,681,870

G

A

2.22

AX-579463030

0

3,776,496

A

G

2.22

AX-579463068

0

3,777,765

A

T

2.22

AX-579463086

0

3,791,971

A

C

2.22

AX-579463269

0

3,848,058

C

T

2.22

AX-579462195

0

3,852,455

T

C

2.22

AX-579463347

0

3,852,798

T

G

2.22

AX-579462236

0

3,859,113

A

G

2.22

AX-579463414

0

3,861,124

A

G

2.22

AX-579463494

0

3,881,563

T

C

2.22

AX-579463498

0

3,882,589

C

T

2.22

AX-579463587

0

3,895,702

A

G

2.22

AX-579462440

0

3,895,907

T

C

2.22

AX-579463874

0

3,956,685

T

C

2.22

AX-579462720

0

3,966,733

C

T

2.22

AX-579463888

0

3,975,909

C

T

2.22

AX-579464091

0

4,020,869

G

T

2.22

AX-579463106

0

4,064,758

T

C

2.22

AX-579463231

0

4,094,036

T

C

2.22

AX-579464583

0

4,107,741

C

G

2.22

AX-579464787

0

4,212,179

T

C

2.22

AX-579464795

0

4,226,874

C

T

2.22

AX-579463642

0

4,262,471

A

G

2.22

AX-579465027

0

4,314,848

G

A

2.22

AX-579463851

0

4,326,276

A

G

2.22

AX-579463996

0

4,330,431

C

T

2.22

AX-579465698

0

4,393,351

C

T

2.22

AX-579465949

0

4,467,885

C

G

2.22

AX-579466055

0

4,480,968

G

C

2.22

AX-579466082

0

4,489,343

A

C

2.22

AX-579465216

0

4,574,326

T

C

2.22

AX-579465235

0

4,574,813

A

G

2.22

AX-579466528

0

4,587,884

C

G

2.22

AX-579466629

0

4,598,725

T

C

2.22

AX-579465413

0

4,599,659

G

T

2.22

AX-579465498

0

4,629,289

T

A

2.22

AX-579465649

0

4,656,488

T

A

2.22

AX-579466030

0

4,689,200

C

T

2.22

AX-579468753

0

5,318,173

T

C

2.22

AX-579471414

0

5,609,135

A

G

2.22

AX-579471641

0

5,644,918

T

A

2.22

AX-579471677

0

5,645,846

T

C

2.22

AX-579471008

0

5,770,455

A

G

2.22

AX-579471253

0

5,803,284

C

T

2.22

AX-579471334

0

5,819,884

C

G

2.22

AX-579471617

0

5,832,805

A

G

2.22

AX-579471746

0

5,843,378

A

G

2.22

AX-579472640

0

6,037,441

G

T

2.22

AX-579474681

0

6,123,654

C

T

2.22

AX-579474277

0

6,162,084

T

C

2.22

AX-579475628

0

6,170,338

A

G

2.22

AX-579475675

0

6,171,699

T

C

2.22

AX-579474609

0

6,173,500

T

A

2.22

AX-579475863

0

6,177,554

T

C

2.22

AX-579477079

0

6,337,539

T

C

2.22

AX-579477559

0

6,419,058

C

T

2.22

AX-579480488

0

7,113,284

C

T

2.22

AX-579480540

0

7,114,693

C

T

2.22

AX-579481793

0

7,166,834

T

C

2.22

AX-579481954

0

7,473,323

T

C

2.22

AX-579483482

0

7,806,720

G

A

2.22

AX-579486420

0

8,074,399

C

T

2.22

AX-579486555

0

8,186,532

T

C

2.22

AX-579486568

0

8,187,000

T

G

2.22

AX-579490366

0

8,853,756

C

T

2.22

AX-579489177

0

8,870,912

C

G

2.22

AX-579489211

0

8,871,496

A

T

2.22

AX-579489422

0

8,894,417

A

C

2.22

AX-579492297

0

9,511,379

C

T

2.22

AX-579491302

0

9,574,144

G

C

2.22

AX-579493621

0

9,747,187

A

C

2.22

AX-579493195

0

9,878,211

G

A

2.22

AX-579494546

0

9,881,500

G

A

2.22

AX-579497824

0

10,502,539

T

C

2.22

AX-579498357

0

10,769,457

A

C

2.22

AX-579498441

0

10,771,570

A

G

2.22

AX-579499981

0

10,805,036

T

C

2.22

AX-579500777

0

11,005,339

A

G

2.22

AX-579499823

0

11,078,281

A

G

2.22

AX-579501488

0

11,165,465

A

C

2.22

AX-579501509

0

11,486,922

T

C

2.22

AX-579501594

0

11,500,114

T

A

2.22

AX-579501695

0

11,518,427

C

T

2.22

AX-579503052

0

11,521,815

T

C

2.22

AX-579503167

0

11,811,918

A

G

2.22

AX-579504531

0

11,812,599

A

G

2.22

AX-579505825

0

12,322,037

C

T

2.22

AX-579506213

0

12,415,362

C

G

2.27

AX-579514764

0

3,143

A

T

2.27

AX-579514780

0

3,924

A

T

2.27

AX-579514832

0

19,391

T

C

2.27

AX-579513447

0

19,620

T

C

2.27

AX-579515086

0

93,803

G

A

2.27

AX-579515106

0

94,019

A

G

2.27

AX-579516800

0

99,017

A

G

2.27

AX-579521449

0

1,313,549

C

G

2.27

AX-579522956

0

1,325,963

C

T

2.27

AX-579521736

0

1,345,366

G

A

2.27

AX-579525293

0

1,561,644

C

T

2.27

AX-579524022

0

1,563,849

T

C

2.27

AX-579525501

0

1,574,245

A

G

2.27

AX-579528877

0

2,083,014

A

G

2.27

AX-579530612

0

2,656,493

A

G

2.27

AX-579531606

0

3,939,052

T

C

2.27

AX-579533372

0

4,626,751

T

C

2.27

AX-579533771

0

5,190,811

T

C

2.27

AX-579534229

0

5,300,874

A

T

2.27

AX-579533798

0

5,480,935

A

G

2.27

AX-579536022

0

5,626,846

T

C

2.27

AX-579536107

0

5,639,527

C

T

2.27

AX-579535050

0

5,653,677

G

A

2.27

AX-579535085

0

5,654,174

G

A

2.27

AX-579535400

0

5,695,991

T

G

2.27

AX-579537686

0

6,005,191

G

C

2.27

AX-579536482

0

6,021,051

C

G

2.27

AX-579537999

0

6,037,155

T

C

2.27

AX-579536746

0

6,037,770

G

A

2.27

AX-579538092

0

6,266,645

G

A

2.27

AX-579539919

0

6,627,914

A

C

2.27

AX-579544247

0

7,482,829

T

C

2.27

AX-579545575

0

7,483,880

T

G

2.27

AX-579546394

0

7,837,393

A

G

2.27

AX-579548165

0

7,946,950

T

C

2.27

AX-579548233

0

7,958,035

A

C

2.27

AX-579547370

0

8,017,758

C

T

2.27

AX-579547376

0

8,017,979

G

C

2.32

AX-579553752

0

2,293,932

T

C

2.36

AX-579554924

0

122,073

T

C

2.36

AX-579555022

0

122,978

A

G

2.36

AX-579554313

0

177,712

G

A

2.36

AX-579554700

0

233,093

G

A

2.36

AX-579555223

0

316,293

G

C

2.36

AX-579556352

0

966,741

A

G

2.36

AX-579556686

0

1,233,251

A

G

2.36

AX-579558663

0

1,923,334

G

A

2.36

AX-579560288

0

1,988,456

G

A

2.36

AX-579560662

0

2,006,981

C

T

2.36

AX-579559480

0

2,010,914

A

C

2.36

AX-579560965

0

2,023,214

A

G

2.36

AX-579559703

0

2,024,629

A

G

2.36

AX-579561712

0

2,103,073

A

G

2.36

AX-579560506

0

2,117,189

G

T

2.36

AX-579560776

0

2,159,466

C

T

2.36

AX-579561033

0

2,360,547

T

G

2.36

AX-579561118

0

2,375,602

A

C

2.36

AX-579561629

0

2,476,040

T

C

2.36

AX-579562408

0

2,742,031

C

T

2.36

AX-579564629

0

2,977,222

C

T

2.36

AX-579563611

0

3,039,586

G

A

2.36

AX-579564982

0

3,063,901

G

A

2.36

AX-579565814

0

3,225,773

A

G

2.36

AX-579566111

0

3,499,011

G

A

2.36

AX-579566130

0

3,499,260

T

G

2.36

AX-579566954

0

3,775,104

A

G

2.36

AX-579568657

0

3,855,224

G

T

2.36

AX-579568951

0

3,924,403

T

G

2.36

AX-579571176

0

4,429,492

C

T

2.36

AX-579571698

0

4,506,577

G

A

2.36

AX-579570501

0

4,525,604

A

G

2.36

AX-579571897

0

4,525,871

A

G

2.36

AX-579571606

0

4,722,263

G

T

2.36

AX-579573008

0

4,722,473

G

A

2.36

AX-579571640

0

4,722,920

T

G

2.36

AX-579571741

0

4,732,500

T

C

2.36

AX-579571778

0

4,733,288

C

A

2.36

AX-579571885

0

4,735,900

T

G

2.36

AX-579572182

0

4,771,453

T

C

2.36

AX-579572348

0

4,773,049

A

G

2.36

AX-579575218

0

4,954,626

T

C

2.36

AX-579574417

0

5,093,568

A

C

2.36

AX-579576372

0

5,174,565

G

C

2.36

AX-579575014

0

5,182,212

G

A

2.36

AX-579576494

0

5,195,710

A

G

2.36

AX-579576620

0

5,220,518

C

T

2.36

AX-579575264

0

5,221,019

T

C

2.36

AX-579576700

0

5,232,144

T

A

2.36

AX-579578676

0

5,871,547

C

A

2.36

AX-579578960

0

5,905,142

A

C

2.36

AX-579579230

0

5,977,765

C

T

2.36

AX-579580763

0

6,013,782

A

T

2.36

AX-579579387

0

6,044,877

A

G

2.36

AX-579579808

0

6,131,092

C

T

2.36

AX-579580108

0

6,183,757

G

A

2.36

AX-579580428

0

6,264,997

T

G

2.36

AX-579582210

0

6,366,093

C

T

2.36

AX-579580737

0

6,374,847

C

T

2.36

AX-579580782

0

6,399,348

G

C

2.36

AX-579582325

0

6,409,036

C

T

2.36

AX-579582344

0

6,424,187

G

T

2.36

AX-579582396

0

6,451,287

A

G

2.36

AX-579581051

0

6,471,713

G

A

2.36

AX-579582591

0

6,471,983

T

C

2.36

AX-579581085

0

6,472,487

G

A

2.36

AX-579582777

0

6,484,066

T

A

2.36

AX-579581842

0

6,601,224

T

C

2.36

AX-579582896

0

6,815,052

T

C

2.36

AX-579584795

0

6,840,596

T

G

2.36

AX-579585088

0

6,864,648

G

A

2.36

AX-579585247

0

6,884,462

A

G

2.36

AX-579585413

0

6,913,172

G

A

2.36

AX-579585547

0

6,979,869

A

G

2.36

AX-579584631

0

7,310,698

G

A

2.36

AX-579586817

0

7,524,611

C

T

2.36

AX-579587054

0

7,555,928

A

T

2.36

AX-579587132

0

7,558,468

A

G

2.36

AX-579585816

0

7,598,150

T

C

2.36

AX-579586639

0

7,738,695

A

G

2.36

AX-579588150

0

7,739,652

G

A

2.36

AX-579588280

0

7,769,335

C

T

2.36

AX-579588443

0

7,785,936

G

T

2.36

AX-579588768

0

7,842,421

C

A

2.36

AX-579588851

0

7,847,327

C

T

2.36

AX-579589139

0

7,863,252

C

T

2.36

AX-579589153

0

7,863,471

A

G

2.36

AX-579589060

0

8,163,406

G

A

2.36

AX-579591399

0

8,368,205

T

C

2.38

AX-579592977

0

33,173

T

C

2.38

AX-579591594

0

105,906

T

G

2.38

AX-579591718

0

138,430

C

T

2.38

AX-579593540

0

225,032

T

G

2.38

AX-579593773

0

307,640

A

G

2.38

AX-579592307

0

307,963

T

C

2.38

AX-579593876

0

332,935

A

C

2.38

AX-579592773

0

808,486

T

C

2.38

AX-579594255

0

808,711

C

T

2.38

AX-579593116

0

904,947

A

G

2.38

AX-579593401

0

1,012,549

C

T

2.38

AX-579595097

0

1,068,943

A

G

2.38

AX-579593808

0

1,253,358

C

T

2.38

AX-579595377

0

1,435,426

A

G

2.38

AX-579594123

0

1,555,916

G

A

2.38

AX-579595929

0

1,745,456

T

G

2.38

AX-579595131

0

1,890,450

C

G

2.38

AX-579596432

0

2,092,493

T

C

2.38

AX-579596460

0

2,093,042

A

C

2.38

AX-579596587

0

2,133,235

A

G

2.38

AX-579597760

0

2,412,837

G

A

2.38

AX-579600249

0

2,643,323

G

C

2.38

AX-579598791

0

2,679,893

C

A

2.38

AX-579598805

0

2,680,377

A

G

2.38

AX-579600802

0

2,768,268

A

G

2.38

AX-579599348

0

2,769,292

A

G

2.38

AX-579601954

0

3,420,287

C

T

2.38

AX-579605417

0

3,866,591

C

T

2.38

AX-579605652

0

3,869,671

G

A

2.38

AX-579605090

0

4,025,293

T

C

2.38

AX-579605155

0

4,034,581

T

C

2.38

AX-579605258

0

4,059,911

G

C

2.38

AX-579605997

0

4,217,138

T

C

2.38

AX-579606037

0

4,233,102

T

C

2.38

AX-579606082

0

4,261,878

A

C

2.38

AX-579606324

0

4,321,442

C

T

2.38

AX-579606484

0

4,362,367

T

C

2.38

AX-579607967

0

4,365,528

A

G

2.40

AX-579609322

0

7,757

T

G

2.40

AX-579607994

0

22,781

C

T

2.40

AX-579608017

0

23,519

A

C

2.40

AX-579608372

0

63,601

T

C

2.40

AX-579609853

0

63,812

A

G

2.40

AX-579608454

0

66,117

A

G

2.40

AX-579610235

0

117,199

C

T

2.40

AX-579610308

0

269,848

A

G

2.40

AX-579610990

0

314,487

T

C

2.40

AX-579610992

0

314,726

G

A

2.40

AX-579609938

0

391,207

A

G

2.40

AX-579611740

0

476,232

G

A

2.40

AX-579611969

0

535,273

G

A

2.40

AX-579612052

0

555,521

C

A

2.40

AX-579610585

0

556,029

T

C

2.40

AX-579612219

0

589,415

T

C

2.40

AX-579610845

0

611,373

T

C

2.40

AX-579611070

0

637,163

A

T

2.40

AX-579612655

0

659,602

C

A

2.40

AX-579611192

0

668,647

T

G

2.40

AX-579613074

0

747,694

A

G

2.40

AX-579613337

0

825,033

T

G

2.40

AX-579613453

0

836,220

G

C

2.40

AX-579612387

0

894,866

A

G

2.40

AX-579614469

0

1,027,270

C

T

2.40

AX-579612967

0

1,027,987

A

G

2.40

AX-579613089

0

1,031,879

G

A

2.40

AX-579614661

0

1,032,599

G

A

2.40

AX-579614672

0

1,032,875

A

G

2.40

AX-579613249

0

1,035,703

G

A

2.40

AX-579615168

0

1,075,172

G

C

2.40

AX-579615300

0

1,086,477

A

G

2.40

AX-579615327

0

1,087,851

A

G

2.40

AX-579615710

0

1,164,681

G

A

2.40

AX-579616152

0

1,273,262

C

A

2.40

AX-579616931

0

1,384,904

C

T

2.40

AX-579615476

0

1,387,460

T

C

2.40

AX-579615529

0

1,407,741

T

C

2.40

AX-579617096

0

1,410,210

T

A

2.40

AX-579617194

0

1,412,279

A

G

2.40

AX-579615703

0

1,412,642

A

G

2.40

AX-579617599

0

1,468,946

A

C

2.40

AX-579617368

0

1,651,660

T

C

2.40

AX-579619072

0

1,665,441

T

C

2.40

AX-579617534

0

1,665,651

A

C

2.40

AX-579618424

0

1,904,505

G

T

2.40

AX-579619997

0

1,910,958

T

C

2.40

AX-579620126

0

1,955,336

T

C

2.40

AX-579618630

0

1,972,960

C

T

2.40

AX-579618760

0

1,995,445

A

G

2.40

AX-579620866

0

2,124,643

G

A

2.40

AX-579619588

0

2,186,172

A

G

2.40

AX-579619840

0

2,225,961

T

C

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "ldna", "clusters_14_aut.docx"))

Let’s check cluster 6_19_42 on AUT chr1

aut_ch1 <- readRDS(here("output", "ldna", "pop", "chr1", "AUT_clusters_snps.rds"))
str(aut_ch1$`1942_0.64`)
##  chr [1:418] "AX-583500511" "AX-583500520" "AX-583503153" "AX-583500379" ...
# We have 566 SNPs on this cluster
cluster_6 <- snps[snps$SNP %in% aut_ch1$`1942_0.64`, ]
head(cluster_6) # 316 SNPs
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.152    AX-583500511     0   339500 G       C      
## 2 1.152    AX-583500520     0   345870 A       G      
## 3 1.152    AX-583503153     0   977944 T       G      
## 4 1.152    AX-583500379     0   978467 G       T      
## 5 1.152    AX-583503258     0   987625 C       T      
## 6 1.152    AX-583504302     0  1149628 A       G

Create table

# Create the flextable
my_flextable <- flextable(cluster_6)

my_flextable <- autofit(my_flextable)

# Display the flextable
my_flextable

Scaffold

SNP

Cm

Position

Allele1

Allele2

1.152

AX-583500511

0

339,500

G

C

1.152

AX-583500520

0

345,870

A

G

1.152

AX-583503153

0

977,944

T

G

1.152

AX-583500379

0

978,467

G

T

1.152

AX-583503258

0

987,625

C

T

1.152

AX-583504302

0

1,149,628

A

G

1.152

AX-583503279

0

1,491,640

G

C

1.152

AX-583503440

0

1,535,179

A

G

1.152

AX-583503906

0

1,564,958

T

C

1.152

AX-583506777

0

1,565,216

C

G

1.152

AX-583504295

0

1,602,992

G

A

1.152

AX-583507197

0

1,603,711

G

T

1.152

AX-583507425

0

1,627,574

G

C

1.152

AX-583507639

0

1,644,594

T

C

1.152

AX-583507878

0

1,664,015

G

A

1.152

AX-583505073

0

1,664,826

T

C

1.152

AX-583508045

0

1,665,840

C

A

1.152

AX-583508498

0

1,669,803

T

C

1.152

AX-583508594

0

1,670,439

G

A

1.152

AX-583508771

0

1,673,676

T

C

1.152

AX-583505971

0

1,673,937

C

T

1.152

AX-583506892

0

1,803,316

G

C

1.152

AX-583511182

0

1,955,809

A

G

1.152

AX-583508519

0

1,966,895

C

T

1.152

AX-583508573

0

1,971,684

T

C

1.152

AX-583509292

0

2,556,372

T

C

1.152

AX-583509391

0

2,580,817

T

C

1.152

AX-583509412

0

2,581,274

A

G

1.152

AX-583509733

0

2,598,090

G

T

1.152

AX-583512511

0

2,598,734

A

G

1.152

AX-583510076

0

2,619,288

C

A

1.152

AX-583512897

0

2,625,567

G

C

1.152

AX-583513248

0

2,655,189

T

G

1.152

AX-583513308

0

2,666,934

T

G

1.152

AX-583513359

0

2,668,625

A

G

1.152

AX-583513377

0

2,668,892

A

G

1.152

AX-583510794

0

2,701,081

A

G

1.152

AX-583510903

0

2,706,078

C

T

1.152

AX-583513648

0

2,725,250

C

T

1.152

AX-583511483

0

2,848,038

G

A

1.152

AX-583514235

0

2,848,259

T

G

1.152

AX-583514359

0

2,888,182

A

G

1.152

AX-583514397

0

2,900,388

A

G

1.152

AX-583511792

0

2,913,109

T

C

1.152

AX-583512377

0

3,056,913

G

A

1.152

AX-583512666

0

3,117,848

C

A

1.152

AX-583515819

0

3,174,781

A

G

1.152

AX-583513555

0

3,382,178

T

C

1.152

AX-583513773

0

3,464,938

G

A

1.152

AX-583517814

0

3,775,044

T

C

1.152

AX-583519450

0

4,112,020

T

C

1.152

AX-583521326

0

5,078,954

C

A

1.152

AX-583521530

0

5,098,043

T

C

1.152

AX-583525764

0

5,184,643

T

C

1.152

AX-583527475

0

5,423,719

G

T

1.152

AX-583527695

0

5,449,431

G

A

1.152

AX-583526378

0

5,567,513

G

A

1.152

AX-583529290

0

5,577,967

C

T

1.152

AX-583530453

0

5,927,434

T

C

1.152

AX-583528261

0

6,093,153

A

G

1.152

AX-583531073

0

6,102,875

T

C

1.152

AX-583528689

0

6,128,044

T

C

1.152

AX-583530679

0

6,478,224

G

A

1.152

AX-583533848

0

6,554,566

A

G

1.152

AX-583532801

0

6,856,535

T

A

1.152

AX-583533673

0

7,055,034

T

C

1.152

AX-583536382

0

7,055,500

A

G

1.152

AX-583536435

0

7,055,779

C

T

1.152

AX-583534984

0

7,166,546

A

C

1.152

AX-583535005

0

7,166,793

G

C

1.152

AX-583537871

0

7,168,782

T

C

1.152

AX-583538179

0

7,190,150

A

G

1.152

AX-583536224

0

7,313,002

A

G

1.152

AX-583538596

0

7,562,522

C

T

1.152

AX-583539329

0

7,641,844

T

C

1.152

AX-583539476

0

7,657,914

T

G

1.152

AX-583542283

0

7,660,371

C

T

1.152

AX-583542500

0

7,716,798

G

A

1.152

AX-583542697

0

7,766,866

T

C

1.154

AX-583542132

0

533,422

G

A

1.156

AX-583547648

0

11,722

C

G

1.156

AX-583544949

0

74,115

C

T

1.170

AX-583556250

0

197,780

A

C

1.170

AX-583560886

0

919,858

C

T

1.170

AX-583571044

0

1,622,696

T

C

1.170

AX-583571065

0

1,623,404

T

G

1.170

AX-583571174

0

1,631,928

A

G

1.170

AX-583571507

0

2,091,806

A

C

# Initialize a Word document
doc <- read_docx()

# Add flextable to Word document
doc <- body_add_flextable(doc, value = my_flextable)

# Save the Word document
print(doc, target = here("output", "ldna", "clusters_6_aut.docx"))
cluster_6_all <- snps[snps$Scaffold %in% cluster_6$Scaffold, ]
head(cluster_6_all)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.152    AX-583496546     0    28493 T       C      
## 2 1.152    AX-583496607     0    43318 A       G      
## 3 1.152    AX-583499574     0    96763 G       A      
## 4 1.152    AX-583499672     0   132155 T       G      
## 5 1.152    AX-583499700     0   136348 A       G      
## 6 1.152    AX-583499977     0   188640 T       A

However we need the other SNPs to know if the blocks are connected or not. Now we know what scaffolds the SNPs are located and we can check it

cluster_14_all <- snps[snps$Scaffold %in% cluster_14$Scaffold, ]
head(cluster_14_all)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 2.22     AX-579445128     0    49751 G       A      
## 2 2.22     AX-579445195     0    64564 A       T      
## 3 2.22     AX-579443938     0   100572 A       G      
## 4 2.22     AX-579445430     0   157266 G       T      
## 5 2.22     AX-579445592     0   202795 T       A      
## 6 2.22     AX-579444301     0   212193 C       T

Now we can create a new column and tag the SNPs that are linked

# Creating the new column 'linked'
cluster_14_all <- cluster_14_all %>%
  mutate(linked = SNP %in% aut_ch2$`3139_0.78`)
head(cluster_14_all)
## # A tibble: 6 × 7
##   Scaffold SNP             Cm Position Allele1 Allele2 linked
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>   <lgl> 
## 1 2.22     AX-579445128     0    49751 G       A       FALSE 
## 2 2.22     AX-579445195     0    64564 A       T       FALSE 
## 3 2.22     AX-579443938     0   100572 A       G       FALSE 
## 4 2.22     AX-579445430     0   157266 G       T       FALSE 
## 5 2.22     AX-579445592     0   202795 T       A       FALSE 
## 6 2.22     AX-579444301     0   212193 C       T       FALSE
# Creating the new column 'linked'
cluster_6_all <- cluster_6_all %>%
  mutate(linked = SNP %in% aut_ch1$`1942_0.64`)
head(cluster_6_all)
## # A tibble: 6 × 7
##   Scaffold SNP             Cm Position Allele1 Allele2 linked
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>   <lgl> 
## 1 1.152    AX-583496546     0    28493 T       C       FALSE 
## 2 1.152    AX-583496607     0    43318 A       G       FALSE 
## 3 1.152    AX-583499574     0    96763 G       A       FALSE 
## 4 1.152    AX-583499672     0   132155 T       G       FALSE 
## 5 1.152    AX-583499700     0   136348 A       G       FALSE 
## 6 1.152    AX-583499977     0   188640 T       A       FALSE

Lets check how many linked SNPs we have and how many aren’t

count_linked <- cluster_14_all %>%
  group_by(linked) %>%
  summarise(count = n())
count_linked
## # A tibble: 2 × 2
##   linked count
##   <lgl>  <int>
## 1 FALSE   2657
## 2 TRUE     316
count_linked <- cluster_6_all %>%
  group_by(linked) %>%
  summarise(count = n())
count_linked
## # A tibble: 2 × 2
##   linked count
##   <lgl>  <int>
## 1 FALSE   1270
## 2 TRUE      88

Get the 157 outliers

snps_157 <-
  read.table(
    here("output", "pcadapt", "outlier_157_SNPs.txt"),
    stringsAsFactors = FALSE
    )

# Get the 157 SNPs 
snps_157b <- cluster_14_all |> 
  filter(SNP %in% snps_157$V1)

head(snps_157b)
## # A tibble: 6 × 7
##   Scaffold SNP             Cm Position Allele1 Allele2 linked
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>   <lgl> 
## 1 2.22     AX-579474142     0  6075268 T       C       FALSE 
## 2 2.22     AX-579474650     0  6122430 A       G       FALSE 
## 3 2.22     AX-579509845     0 13142038 A       T       FALSE 
## 4 2.36     AX-579556477     0  1062758 T       C       FALSE 
## 5 2.36     AX-579560686     0  2139679 C       T       FALSE 
## 6 2.36     AX-579564292     0  2894537 A       G       FALSE
# Get the 157 SNPs 
snps_157c <- cluster_6_all |> 
  filter(SNP %in% snps_157$V1)

head(snps_157c)
## # A tibble: 6 × 7
##   Scaffold SNP             Cm Position Allele1 Allele2 linked
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>   <lgl> 
## 1 1.152    AX-583504302     0  1149628 A       G       TRUE  
## 2 1.152    AX-583514148     0  2847295 T       C       FALSE 
## 3 1.152    AX-583515734     0  3903190 A       G       FALSE 
## 4 1.152    AX-583518586     0  3966380 T       C       FALSE 
## 5 1.152    AX-583518994     0  4024772 A       G       FALSE 
## 6 1.152    AX-583516491     0  4057192 T       C       FALSE

Plot it

# Function to format numbers as Mb
label_mb <- function(x) {
  sprintf("%.0fMb", x / 1e6)
}


# Calculate the start and end positions for each stretch of TRUE or FALSE
rect_data <- cluster_14_all %>%
  arrange(Scaffold, Position) %>%
  mutate(change = linked != lag(linked, default = first(linked))) %>%
  group_by(Scaffold) %>%
  mutate(group_id = cumsum(change)) %>%
  group_by(Scaffold, group_id, linked) %>%
  summarize(start = min(Position), end = max(Position), .groups = 'drop') %>%
  ungroup()

# Plotting using geom_rect
ggplot(rect_data, aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1)) +
  # Add a background layer for each scaffold with gray color
  geom_rect(data = rect_data %>% group_by(Scaffold) %>% 
            summarize(start = min(start), end = max(end), .groups = 'drop'), 
            aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1), 
            fill = "gray", inherit.aes = FALSE) +
  # Add the TRUE stretches
  geom_rect(aes(fill = linked), data = rect_data %>% filter(linked == TRUE)) +
  scale_fill_manual(values = c("TRUE" = "green")) +
  facet_wrap(~ Scaffold, scales = "fixed", ncol = 1) +
  geom_vline(data = snps_157b, aes(xintercept = Position, color = SNP), linetype = "solid", color = "red") +
  geom_text(data = snps_157b, aes(x = Position, y = as.numeric(Scaffold) + 0.2, label = SNP), inherit.aes = FALSE, angle = 90, vjust = 0, size = 2, check_overlap = TRUE) +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank(), 
    panel.spacing = unit(0.1, "lines"),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(t = 1, b = 5))
  ) +
  scale_x_continuous(labels = label_mb, breaks = scales::pretty_breaks(n = 6)) +
  labs(x = "Position", title = "Continuous Stretches of Linked SNPs by Scaffold - Cluster 14 AUT", fill = "Linked") +
  guides(fill = "none")

# Save Venn diagram to PDF
output_path <- here("output", "ldna", "figures", "cluster_14_aut_scaffolds.pdf")
ggsave(output_path, height = 5, width = 8, units = "in")

To make sure the windows appear even if they are small

# Set a minimum width for the windows
min_width <- 1e4  # for example, 1 million base pairs

# Adjust the rect_data calculation
rect_data <- cluster_14_all %>%
  arrange(Scaffold, Position) %>%
  mutate(change = linked != lag(linked, default = first(linked))) %>%
  group_by(Scaffold) %>%
  mutate(group_id = cumsum(change)) %>%
  group_by(Scaffold, group_id, linked) %>%
  summarize(start = min(Position), end = max(Position), .groups = 'drop') %>%
  ungroup() %>%
  mutate(width = end - start,
         adjusted_end = ifelse(width < min_width, start + min_width, end))

# Plotting using geom_rect with adjusted ends
ggplot(rect_data, aes(xmin = start, xmax = adjusted_end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1)) +
  geom_rect(data = rect_data %>% group_by(Scaffold) %>% 
            summarize(start = min(start), end = max(adjusted_end), .groups = 'drop'), 
            aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1), 
            fill = "gray", inherit.aes = FALSE) +
  geom_rect(aes(fill = linked), data = rect_data %>% filter(linked == TRUE)) +
  scale_fill_manual(values = c("TRUE" = "green")) +
  facet_wrap(~ Scaffold, scales = "fixed", ncol = 1) +
  theme_minimal() +
  geom_vline(data = snps_157b, aes(xintercept = Position, color = SNP), linetype = "solid", color = "red") +
  geom_text_repel(data = snps_157b, aes(x = Position, y = as.numeric(Scaffold) + 0.2, label = SNP), 
                  inherit.aes = FALSE, angle = 45, size = 2, 
                  nudge_y = 0.1,   # Adjust this value as needed
                  direction = "y") +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), panel.spacing = unit(0.1, "lines"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        strip.text = element_text(size = 12, face = "bold", hjust = 0.5)) +
  scale_x_continuous(labels = label_mb, breaks = scales::pretty_breaks(n = 6)) +
  labs(x = "Position", title = "Continuous Stretches of Linked SNPs by Scaffold - Cluster 14 AUT", fill = "Linked") +
  guides(fill = "none")
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees

# Save Venn diagram to PDF
output_path <- here("output", "ldna", "figures", "cluster_14_aut_scaffolds_with_small_windows.pdf")
ggsave(output_path, height = 5, width = 8, units = "in")
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees
## ggrepel: Repulsion works correctly only for rotation angles multiple of 90 degrees

Cluster 6

# Function to format numbers as Mb
label_mb <- function(x) {
  sprintf("%.0fMb", x / 1e6)
}


# Calculate the start and end positions for each stretch of TRUE or FALSE
rect_data <- cluster_6_all %>%
  arrange(Scaffold, Position) %>%
  mutate(change = linked != lag(linked, default = first(linked))) %>%
  group_by(Scaffold) %>%
  mutate(group_id = cumsum(change)) %>%
  group_by(Scaffold, group_id, linked) %>%
  summarize(start = min(Position), end = max(Position), .groups = 'drop') %>%
  ungroup()

# Plotting using geom_rect
ggplot(rect_data, aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1)) +
  # Add a background layer for each scaffold with gray color
  geom_rect(data = rect_data %>% group_by(Scaffold) %>% 
            summarize(start = min(start), end = max(end), .groups = 'drop'), 
            aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1), 
            fill = "gray", inherit.aes = FALSE) +
  # Add the TRUE stretches
  geom_rect(aes(fill = linked), data = rect_data %>% filter(linked == TRUE)) +
  scale_fill_manual(values = c("TRUE" = "green")) +
  facet_wrap(~ Scaffold, scales = "fixed", ncol = 1) +
  geom_vline(data = snps_157c, aes(xintercept = Position, color = SNP), linetype = "solid", color = "red") +
  geom_text(data = snps_157c, aes(x = Position, y = as.numeric(Scaffold) + 0.2, label = SNP), inherit.aes = FALSE, angle = 90, vjust = 0, size = 2, check_overlap = TRUE) +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank(), 
    panel.spacing = unit(0.1, "lines"),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(t = 1, b = 5))
  ) +
  scale_x_continuous(labels = label_mb, breaks = scales::pretty_breaks(n = 6)) +
  labs(x = "Position", title = "Continuous Stretches of Linked SNPs by Scaffold - Cluster 6 AUT", fill = "Linked") +
  guides(fill = "none")

# Save Venn diagram to PDF
output_path <- here("output", "ldna", "figures", "cluster_6_aut_scaffolds.pdf")
ggsave(output_path, height = 5, width = 8, units = "in")

To make sure the windows appear even if they are small

# Set a minimum width for the windows
min_width <- 1e4  # for example, 1 million base pairs

# Adjust the rect_data calculation
rect_data <- cluster_6_all %>%
  arrange(Scaffold, Position) %>%
  mutate(change = linked != lag(linked, default = first(linked))) %>%
  group_by(Scaffold) %>%
  mutate(group_id = cumsum(change)) %>%
  group_by(Scaffold, group_id, linked) %>%
  summarize(start = min(Position), end = max(Position), .groups = 'drop') %>%
  ungroup() %>%
  mutate(width = end - start,
         adjusted_end = ifelse(width < min_width, start + min_width, end))

# Plotting using geom_rect with adjusted ends
ggplot(rect_data, aes(xmin = start, xmax = adjusted_end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1)) +
  geom_rect(data = rect_data %>% group_by(Scaffold) %>% 
            summarize(start = min(start), end = max(adjusted_end), .groups = 'drop'), 
            aes(xmin = start, xmax = end, ymin = as.numeric(Scaffold) - 0.1, ymax = as.numeric(Scaffold) + 0.1), 
            fill = "gray", inherit.aes = FALSE) +
  geom_rect(aes(fill = linked), data = rect_data %>% filter(linked == TRUE)) +
  scale_fill_manual(values = c("TRUE" = "green")) +
  facet_wrap(~ Scaffold, scales = "fixed", ncol = 1) +
  theme_minimal() +
  geom_vline(data = snps_157c, aes(xintercept = Position, color = SNP), linetype = "solid", color = "red") +
  geom_text_repel(data = snps_157c, aes(x = Position, y = as.numeric(Scaffold) + 0.2, label = SNP), 
                  inherit.aes = FALSE, angle = 45, size = 2, 
                  nudge_y = 0.1,   # Adjust this value as needed
                  direction = "y") +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), panel.spacing = unit(0.1, "lines"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        strip.text = element_text(size = 12, face = "bold", hjust = 0.5)) +
  scale_x_continuous(labels = label_mb, breaks = scales::pretty_breaks(n = 6)) +
  labs(x = "Position", title = "Continuous Stretches of Linked SNPs by Scaffold - Cluster 6 AUT", fill = "Linked") +
  guides(fill = "none")
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple
## of 90 degrees

# Save Venn diagram to PDF
output_path <- here("output", "ldna", "figures", "cluster_6_aut_scaffolds_with_small_windows.pdf")
ggsave(output_path, height = 5, width = 8, units = "in")
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple
## of 90 degrees

Calculate size

# Calculate the length of each stretch
rect_data <- rect_data %>%
  mutate(length = end - start + 1)

# Calculate average sizes for TRUE and FALSE stretches
average_sizes <- rect_data %>%
  group_by(linked) %>%
  summarize(average_size = mean(length))

# Display the average sizes
print(average_sizes)
## # A tibble: 2 × 2
##   linked average_size
##   <lgl>         <dbl>
## 1 FALSE       138014.
## 2 TRUE          1044.

We can also check what genes are in these scaffolds.

Import the data

snps_genes_chr <- readRDS(here("output", "ldna", "snps_genes_chr.rds"))

The expression data

gene_expression <- read_delim(here("data", "files","MANvsAUTO_sig_mRNAs.csv"), delim = ",", col_names = TRUE, show_col_types = FALSE) |>
  dplyr::select(
    gene,log2FoldChange 
  ) |>
  dplyr::rename(
    Gene_ID = gene
  )
head(gene_expression)
## # A tibble: 6 × 2
##   Gene_ID      log2FoldChange
##   <chr>                 <dbl>
## 1 LOC115262812         -10.5 
## 2 LOC109401291          -9.19
## 3 LOC109397830           8.73
## 4 LOC115264022           8.56
## 5 LOC115260314           8.51
## 6 LOC115258723           8.39

For example, what genes are in the cluster 14, so we can check what genes are in these scaffolds or more specifically what SNPs are in the genes

First, what genes are in these scaffolds?

cluster_14_genes <- snps_genes_chr[snps_genes_chr$Scaffold %in% cluster_14_all$Scaffold, ]

# How many genes in the scaffolds
length(unique(cluster_14_genes$Gene_ID))
## [1] 321

Now lets check what genes have the linked SNPs

cluster_14_snps2 <- snps_genes_chr[snps_genes_chr$SNP %in% aut_ch2$`3139_0.78`, ]

# How many genes with SNPs from the ld block
length(unique(cluster_14_snps2$Gene_ID))
## [1] 109

Save as Excel file

# Save the data frame to an Excel file
write_xlsx(cluster_14_snps2, here("output", "ldna", "cluster_14_snps2.xlsx"))

We can count how many linked SNPs per gene

# Count the number of SNPs for each Gene_ID
snp_count_per_gene <- cluster_14_snps2 %>%
  group_by(Gene_ID) %>%
  summarise(SNP_count = n())

# View the first few rows of the result
head(snp_count_per_gene)
## # A tibble: 6 × 2
##   Gene_ID      SNP_count
##   <chr>            <int>
## 1 LOC109397326         1
## 2 LOC109397337         1
## 3 LOC109397339         1
## 4 LOC109397340         1
## 5 LOC109397344         2
## 6 LOC109397369         4

Save as Excel file

# Save the data frame to an Excel file
write_xlsx(snp_count_per_gene, here("output", "ldna", "snp_count_per_gene.xlsx"))

Now we can check how many DE genes are among the 109 genes that have SNPs on the cluster 14

cluster_14_snps3 <- cluster_14_snps2[cluster_14_snps2$Gene_ID %in% gene_expression$Gene_ID, ]

# How many genes in the scaffolds
length(unique(cluster_14_snps3$Gene_ID))
## [1] 3
unique(cluster_14_snps3$Gene_ID)
## [1] "LOC109415738" "LOC109415743" "LOC109414739"

Get the 17 SNPs position

# Get the 17 SNPs 
genes_17_snps <- snps_genes_chr |> 
  dplyr::filter(SNP %in% snps_157b$SNP)

genes_17_snps <- snps_genes_chr[snps_genes_chr$SNP %in% snps_157b$SNP, ]

head(genes_17_snps)
## # A tibble: 6 × 8
##   SNP          Chromosome Position_chr Scaffold Position Gene_ID    Start    End
##   <chr>        <chr>             <dbl> <chr>       <dbl> <chr>      <dbl>  <dbl>
## 1 AX-579474142 2             368006161 2.22      6075268 LOC10941… 6.07e6 6.08e6
## 2 AX-579474650 2             368053323 2.22      6122430 LOC10941… 6.12e6 6.19e6
## 3 AX-579509845 2             375072931 2.22     13142038 LOC10939… 1.25e7 1.31e7
## 4 AX-579556477 2             390614565 2.36      1062758 LOC10939… 9.66e5 1.06e6
## 5 AX-579580051 2             395422791 2.36      5870984 LOC10940… 5.87e6 5.90e6
## 6 AX-579584369 2             396355776 2.36      6803969 LOC10941… 6.71e6 6.82e6

Save as Excel file

# Save the data frame to an Excel file
write_xlsx(genes_17_snps, here("output", "ldna", "cluster_14_17_snps.xlsx"))

Get the SNPs for which we do not have genes

genes_17_snps2 <- snps_157b |> 
  dplyr::filter(!(SNP %in% genes_17_snps$SNP))
head(genes_17_snps2)
## # A tibble: 6 × 7
##   Scaffold SNP             Cm Position Allele1 Allele2 linked
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>   <lgl> 
## 1 2.36     AX-579560686     0  2139679 C       T       FALSE 
## 2 2.36     AX-579564292     0  2894537 A       G       FALSE 
## 3 2.36     AX-579565469     0  3152089 A       T       FALSE 
## 4 2.38     AX-579596233     0  2030415 A       G       FALSE 
## 5 2.38     AX-579602153     0  3099174 G       A       FALSE 
## 6 2.38     AX-579604213     0  3599115 T       C       FALSE

Save it (17 genes)

write.table(
  snps_157b,
  file      = here(
    "output", "ldna", "snps_157b.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)